]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Make decode_video_format() private.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <condition_variable>
29 #include <cstddef>
30 #include <cstdint>
31 #include <deque>
32 #include <functional>
33 #include <memory>
34 #include <mutex>
35 #include <stack>
36 #include <string>
37 #include <thread>
38
39 using namespace std;
40 using namespace std::placeholders;
41
42 #define USB_VENDOR_BLACKMAGIC 0x1edb
43 #define MIN_WIDTH 640
44 #define HEADER_SIZE 44
45 //#define HEADER_SIZE 0
46 #define AUDIO_HEADER_SIZE 4
47
48 #define FRAME_SIZE (8 << 20)  // 8 MB.
49 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
50
51 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
52
53 namespace {
54
55 FILE *audiofp;
56
57 thread usb_thread;
58 atomic<bool> should_quit;
59
// Returns the isochronous packet size (in bytes) to use for frames of the
// given width: six lines' worth of data at two bytes per pixel, rounded up
// to the next multiple of 1 kB.
int find_xfer_size_for_width(int width)
{
        constexpr int bytes_per_pixel = 2;
        constexpr int lines_per_packet = 6;
        constexpr int granularity = 1024;  // Packets must be a multiple of 1 kB.

        // Note that for 10-bit input, the raw size would have to be scaled
        // up accordingly (by 4/3) before rounding.
        const int raw_size = width * bytes_per_pixel * lines_per_packet;
        return (raw_size + (granularity - 1)) & ~(granularity - 1);
}
74
75 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
76 {
77         assert(width >= MIN_WIDTH);
78         size_t size = find_xfer_size_for_width(width);
79         int num_iso_pack = xfr->length / size;
80         if (num_iso_pack != xfr->num_iso_packets ||
81             size != xfr->iso_packet_desc[0].length) {
82                 xfr->num_iso_packets = num_iso_pack;
83                 libusb_set_iso_packet_lengths(xfr, size);
84         }
85 }
86
// One row of a video format lookup table. The member order is significant:
// rows are written as positional aggregate initializers.
struct VideoFormatEntry {
        uint16_t normalized_video_format;  // Key to match against.
        unsigned width;
        unsigned height;
        unsigned second_field_start;
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;
        unsigned frame_rate_nom;
        unsigned frame_rate_den;
        bool interlaced;
};
94
95 // Get details for the given video format; returns false if detection was incomplete.
96 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
97 {
98         decoded_video_format->id = video_format;
99         decoded_video_format->interlaced = false;
100
101         // TODO: Add these for all formats as we find them.
102         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
103
104         if (video_format == 0x0800) {
105                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
106                 // It's a strange thing, but what can you do.
107                 decoded_video_format->width = 720;
108                 decoded_video_format->height = 525;
109                 decoded_video_format->extra_lines_top = 0;
110                 decoded_video_format->extra_lines_bottom = 0;
111                 decoded_video_format->frame_rate_nom = 3013;
112                 decoded_video_format->frame_rate_den = 100;
113                 decoded_video_format->has_signal = false;
114                 return true;
115         }
116         if ((video_format & 0xe800) != 0xe800) {
117                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
118                         video_format);
119                 decoded_video_format->width = 0;
120                 decoded_video_format->height = 0;
121                 decoded_video_format->extra_lines_top = 0;
122                 decoded_video_format->extra_lines_bottom = 0;
123                 decoded_video_format->frame_rate_nom = 60;
124                 decoded_video_format->frame_rate_den = 1;
125                 decoded_video_format->has_signal = false;
126                 return false;
127         }
128
129         decoded_video_format->has_signal = true;
130
131         // NTSC (480i59.94, I suppose). A special case, see below.
132         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
133                 decoded_video_format->width = 720;
134                 decoded_video_format->height = 480;
135                 decoded_video_format->extra_lines_top = 17;
136                 decoded_video_format->extra_lines_bottom = 28;
137                 decoded_video_format->frame_rate_nom = 30000;
138                 decoded_video_format->frame_rate_den = 1001;
139                 decoded_video_format->second_field_start = 280;
140                 decoded_video_format->interlaced = true;
141                 return true;
142         }
143
144         // PAL (576i50, I suppose). A special case, see below.
145         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
146                 decoded_video_format->width = 720;
147                 decoded_video_format->height = 576;
148                 decoded_video_format->extra_lines_top = 22;
149                 decoded_video_format->extra_lines_bottom = 27;
150                 decoded_video_format->frame_rate_nom = 25;
151                 decoded_video_format->frame_rate_den = 1;
152                 decoded_video_format->second_field_start = 335;
153                 decoded_video_format->interlaced = true;
154                 return true;
155         }
156
157         // 0x8 seems to be a flag about availability of deep color on the input,
158         // except when it's not (e.g. it's the only difference between NTSC
159         // and PAL). Rather confusing. But we clear it here nevertheless, because
160         // usually it doesn't mean anything.
161         //
162         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
163         uint16_t normalized_video_format = video_format & ~0xe80c;
164         constexpr VideoFormatEntry entries[] = {
165                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
166                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
167                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
168                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
169                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
170                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
171                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
172                 { 0x01c3, 1920, 1080,   0,  0,  0,    30,    1, false },  // 1080p30.
173                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
174                 { 0x01e1, 1920, 1080,   0,  0,  0, 30000, 1001, false },  // 1080p29.97.
175                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
176                 { 0x0063, 1920, 1080,   0,  0,  0,    25,    1, false },  // 1080p25.
177                 { 0x0043, 1920, 1080,   0,  0,  0,    25,    1,  true },  // 1080p50.
178                 { 0x008e, 1920, 1080,   0,  0,  0,    24,    1, false },  // 1080p24.
179                 { 0x00a1, 1920, 1080,   0,  0,  0, 24000, 1001, false },  // 1080p23.98.
180         };
181         for (const VideoFormatEntry &entry : entries) {
182                 if (normalized_video_format == entry.normalized_video_format) {
183                         decoded_video_format->width = entry.width;
184                         decoded_video_format->height = entry.height;
185                         decoded_video_format->second_field_start = entry.second_field_start;
186                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
187                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
188                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
189                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
190                         decoded_video_format->interlaced = entry.interlaced;
191                         return true;
192                 }
193         }
194
195         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
196         decoded_video_format->width = 1280;
197         decoded_video_format->height = 720;
198         decoded_video_format->frame_rate_nom = 60;
199         decoded_video_format->frame_rate_den = 1;
200         return false;
201 }
202
203 }  // namespace
204
205 FrameAllocator::~FrameAllocator() {}
206
207 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
208         : frame_size(frame_size)
209 {
210         for (size_t i = 0; i < num_queued_frames; ++i) {
211                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
212         }
213 }
214
215 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
216 {
217         Frame vf;
218         vf.owner = this;
219
220         unique_lock<mutex> lock(freelist_mutex);  // Meh.
221         if (freelist.empty()) {
222                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
223                         frame_size);
224         } else {
225                 vf.data = freelist.top().release();
226                 vf.size = frame_size;
227                 freelist.pop();  // Meh.
228         }
229         return vf;
230 }
231
232 void MallocFrameAllocator::release_frame(Frame frame)
233 {
234         if (frame.overflow > 0) {
235                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
236         }
237         unique_lock<mutex> lock(freelist_mutex);
238         freelist.push(unique_ptr<uint8_t[]>(frame.data));
239 }
240
// Returns true if a comes strictly before b on the 16-bit circle, i.e. if
// stepping forward from a reaches b in at least 1 and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Truncating to 16 bits gives the forward distance modulo 0x10000.
        const uint16_t forward_distance = static_cast<uint16_t>(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
252
253 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
254 {
255         unique_lock<mutex> lock(queue_lock);
256         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
257                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
258                         q->back().timecode, timecode);
259                 frame.owner->release_frame(frame);
260                 return;
261         }
262
263         QueuedFrame qf;
264         qf.format = format;
265         qf.timecode = timecode;
266         qf.frame = frame;
267         q->push_back(move(qf));
268         queues_not_empty.notify_one();  // might be spurious
269 }
270
271 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
272 {
273         FILE *fp = fopen(filename, "wb");
274         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
275                 printf("short write!\n");
276         }
277         fclose(fp);
278 }
279
280 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
281 {
282         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
283 }
284
285 void BMUSBCapture::dequeue_thread_func()
286 {
287         if (has_dequeue_callbacks) {
288                 dequeue_init_callback();
289         }
290         while (!dequeue_thread_should_quit) {
291                 unique_lock<mutex> lock(queue_lock);
292                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
293
294                 if (dequeue_thread_should_quit) break;
295
296                 uint16_t video_timecode = pending_video_frames.front().timecode;
297                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
298                 AudioFormat audio_format;
299                 audio_format.bits_per_sample = 24;
300                 audio_format.num_channels = 8;
301                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
302                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
303                                 video_timecode);
304                         QueuedFrame video_frame = pending_video_frames.front();
305                         pending_video_frames.pop_front();
306                         lock.unlock();
307                         video_frame_allocator->release_frame(video_frame.frame);
308                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
309                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
310                                 audio_timecode);
311                         QueuedFrame audio_frame = pending_audio_frames.front();
312                         pending_audio_frames.pop_front();
313                         lock.unlock();
314                         audio_format.id = audio_frame.format;
315
316                         // Use the video format of the pending frame.
317                         QueuedFrame video_frame = pending_video_frames.front();
318                         VideoFormat video_format;
319                         decode_video_format(video_frame.format, &video_format);
320
321                         frame_callback(audio_timecode,
322                                        FrameAllocator::Frame(), 0, video_format,
323                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
324                 } else {
325                         QueuedFrame video_frame = pending_video_frames.front();
326                         QueuedFrame audio_frame = pending_audio_frames.front();
327                         pending_audio_frames.pop_front();
328                         pending_video_frames.pop_front();
329                         lock.unlock();
330
331 #if 0
332                         char filename[255];
333                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
334                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
335                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
336 #endif
337
338                         VideoFormat video_format;
339                         audio_format.id = audio_frame.format;
340                         if (decode_video_format(video_frame.format, &video_format)) {
341                                 frame_callback(video_timecode,
342                                                video_frame.frame, HEADER_SIZE, video_format,
343                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
344                         } else {
345                                 frame_callback(video_timecode,
346                                                FrameAllocator::Frame(), 0, video_format,
347                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
348                         }
349                 }
350         }
351         if (has_dequeue_callbacks) {
352                 dequeue_cleanup_callback();
353         }
354 }
355
356 void BMUSBCapture::start_new_frame(const uint8_t *start)
357 {
358         uint16_t format = (start[3] << 8) | start[2];
359         uint16_t timecode = (start[1] << 8) | start[0];
360
361         if (current_video_frame.len > 0) {
362                 // If format is 0x0800 (no signal), add a fake (empty) audio
363                 // frame to get it out of the queue.
364                 // TODO: Figure out if there are other formats that come with
365                 // no audio, and treat them the same.
366                 if (format == 0x0800) {
367                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
368                         if (fake_audio_frame.data == nullptr) {
369                                 // Oh well, it's just a no-signal frame anyway.
370                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
371                                 current_video_frame.owner->release_frame(current_video_frame);
372                                 current_video_frame = video_frame_allocator->alloc_frame();
373                                 return;
374                         }
375                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
376                 }
377                 //dump_frame();
378                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
379
380                 // Update the assumed frame width. We might be one frame too late on format changes,
381                 // but it's much better than asking the user to choose manually.
382                 VideoFormat video_format;
383                 if (decode_video_format(format, &video_format)) {
384                         assumed_frame_width = video_format.width;
385                 }
386         }
387         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
388         //      format, timecode,
389         //      //start[7], start[6], start[5], start[4],
390         //      read_current_frame, FRAME_SIZE);
391
392         current_video_frame = video_frame_allocator->alloc_frame();
393         //if (current_video_frame.data == nullptr) {
394         //      read_current_frame = -1;
395         //} else {
396         //      read_current_frame = 0;
397         //}
398 }
399
400 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
401 {
402         uint16_t format = (start[3] << 8) | start[2];
403         uint16_t timecode = (start[1] << 8) | start[0];
404         if (current_audio_frame.len > 0) {
405                 //dump_audio_block();
406                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
407         }
408         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
409         //      format, timecode, read_current_audio_block);
410         current_audio_frame = audio_frame_allocator->alloc_frame();
411 }
412
#if 0
// Debugging aid (compiled out): hex-dumps the received bytes of one
// isochronous packet, which start at the given offset into the transfer
// buffer. Sixteen bytes per row, with a wider gap after the eighth.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", xfr->buffer[pos + offset]);
                switch (pos % 16) {
                case 15:
                        printf("\n");
                        break;
                case 7:
                        printf("  ");
                        break;
                default:
                        printf(" ");
                        break;
                }
        }
}
#endif
429
// Splits n bytes from src into two streams: bytes at even offsets go to
// dest1 and bytes at odd offsets go to dest2. n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t offset = 0; offset < n; offset += 2) {
                const size_t pair = offset / 2;
                dest1[pair] = src[offset];
                dest2[pair] = src[offset + 1];
        }
}
441
442 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
443 {
444         if (current_frame->data == nullptr ||
445             current_frame->len > current_frame->size ||
446             start == end) {
447                 return;
448         }
449
450         int bytes = end - start;
451         if (current_frame->len + bytes > current_frame->size) {
452                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
453                 current_frame->len = current_frame->size;
454                 if (current_frame->overflow > 1048576) {
455                         printf("%d bytes overflow after last %s frame\n",
456                                 int(current_frame->overflow), frame_type_name);
457                         current_frame->overflow = 0;
458                 }
459                 //dump_frame();
460         } else {
461                 if (current_frame->interleaved) {
462                         uint8_t *data = current_frame->data + current_frame->len / 2;
463                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
464                         if (current_frame->len % 2 == 1) {
465                                 ++data;
466                                 swap(data, data2);
467                         }
468                         if (bytes % 2 == 1) {
469                                 *data++ = *start++;
470                                 swap(data, data2);
471                                 ++current_frame->len;
472                                 --bytes;
473                         }
474                         memcpy_interleaved(data, data2, start, bytes);
475                         current_frame->len += bytes;
476                 } else {
477                         memcpy(current_frame->data + current_frame->len, start, bytes);
478                         current_frame->len += bytes;
479                 }
480         }
481 }
482
#if 0
// Debugging aid (compiled out): prints the 32 bytes of an AVX2 register in
// hex after the given label, as four groups of eight bytes.
void avx2_dump(const char *name, __m256i n)
{
        // Spill the register to memory so that the bytes can be indexed in a loop.
        uint8_t bytes[32];
        memcpy(bytes, &n, sizeof(bytes));

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
525
526 #ifndef HAS_MULTIVERSIONING
527
528 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
529 {
530         // No fast path possible unless we have multiversioning.
531         return start;
532 }
533
534 #else  // defined(HAS_MULTIVERSIONING)
535
536 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
537
538 // Does a memcpy and memchr in one to reduce processing time.
539 // Note that the benefit is somewhat limited if your L3 cache is small,
540 // as you'll (unfortunately) spend most of the time loading the data
541 // from main memory.
542 //
543 // Complicated cases are left to the slow path; it basically stops copying
544 // up until the first instance of "sync_char" (usually a bit before, actually).
545 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
546 // data, and what we really need this for is the 00 00 ff ff marker in video data.
547 __attribute__((target("default")))
548 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
549 {
550         // No fast path possible unless we have SSE 4.1 or higher.
551         return start;
552 }
553
554 __attribute__((target("sse4.1", "avx2")))
555 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
556 {
557         if (current_frame->data == nullptr ||
558             current_frame->len > current_frame->size ||
559             start == limit) {
560                 return start;
561         }
562         size_t orig_bytes = limit - start;
563         if (orig_bytes < 128) {
564                 // Don't bother.
565                 return start;
566         }
567
568         // Don't read more bytes than we can write.
569         limit = min(limit, start + (current_frame->size - current_frame->len));
570
571         // Align end to 32 bytes.
572         limit = (const uint8_t *)(intptr_t(limit) & ~31);
573
574         if (start >= limit) {
575                 return start;
576         }
577
578         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
579         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
580         if (aligned_start != start) {
581                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
582                 if (sync_start == nullptr) {
583                         add_to_frame(current_frame, "", start, aligned_start);
584                 } else {
585                         add_to_frame(current_frame, "", start, sync_start);
586                         return sync_start;
587                 }
588         }
589
590         // Make the length a multiple of 64.
591         if (current_frame->interleaved) {
592                 if (((limit - aligned_start) % 64) != 0) {
593                         limit -= 32;
594                 }
595                 assert(((limit - aligned_start) % 64) == 0);
596         }
597
598         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
599 }
600
601 __attribute__((target("avx2")))
602 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
603 {
604         const __m256i needle = _mm256_set1_epi8(sync_char);
605
606         const __restrict __m256i *in = (const __m256i *)aligned_start;
607         if (current_frame->interleaved) {
608                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
609                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
610                 if (current_frame->len % 2 == 1) {
611                         swap(out1, out2);
612                 }
613
614                 __m256i shuffle_cw = _mm256_set_epi8(
615                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
616                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
617                 while (in < (const __m256i *)limit) {
618                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
619                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
620                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
621
622                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
623                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
624                         __m256i found = _mm256_or_si256(found1, found2);
625
626                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
627                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
628                 
629                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
630                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
631
632                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
633                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
634
635                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
636                         _mm256_storeu_si256(out2, hi);
637
638                         if (!_mm256_testz_si256(found, found)) {
639                                 break;
640                         }
641
642                         in += 2;
643                         ++out1;
644                         ++out2;
645                 }
646                 current_frame->len += (uint8_t *)in - aligned_start;
647         } else {
648                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
649                 while (in < (const __m256i *)limit) {
650                         __m256i data = _mm256_load_si256(in);
651                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
652                         __m256i found = _mm256_cmpeq_epi8(data, needle);
653                         if (!_mm256_testz_si256(found, found)) {
654                                 break;
655                         }
656
657                         ++in;
658                         ++out;
659                 }
660                 current_frame->len = (uint8_t *)out - current_frame->data;
661         }
662
663         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
664         return (const uint8_t *)in;
665 }
666
667 __attribute__((target("sse4.1")))
668 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
669 {
670         const __m128i needle = _mm_set1_epi8(sync_char);
671
672         const __m128i *in = (const __m128i *)aligned_start;
673         if (current_frame->interleaved) {
674                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
675                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
676                 if (current_frame->len % 2 == 1) {
677                         swap(out1, out2);
678                 }
679
680                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
681                 while (in < (const __m128i *)limit) {
682                         __m128i data1 = _mm_load_si128(in);
683                         __m128i data2 = _mm_load_si128(in + 1);
684                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
685                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
686                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
687                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
688                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
689                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
690                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
691                         _mm_storeu_si128(out2, hi);
692                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
693                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
694                         if (!_mm_testz_si128(found1, found1) ||
695                             !_mm_testz_si128(found2, found2)) {
696                                 break;
697                         }
698
699                         in += 2;
700                         ++out1;
701                         ++out2;
702                 }
703                 current_frame->len += (uint8_t *)in - aligned_start;
704         } else {
705                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
706                 while (in < (const __m128i *)limit) {
707                         __m128i data = _mm_load_si128(in);
708                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
709                         __m128i found = _mm_cmpeq_epi8(data, needle);
710                         if (!_mm_testz_si128(found, found)) {
711                                 break;
712                         }
713
714                         ++in;
715                         ++out;
716                 }
717                 current_frame->len = (uint8_t *)out - current_frame->data;
718         }
719
720         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
721         return (const uint8_t *)in;
722 }
723
724 #endif  // defined(HAS_MULTIVERSIONING)
725
726 void decode_packs(const libusb_transfer *xfr,
727                   const char *sync_pattern,
728                   int sync_length,
729                   FrameAllocator::Frame *current_frame,
730                   const char *frame_type_name,
731                   function<void(const uint8_t *start)> start_callback)
732 {
733         int offset = 0;
734         for (int i = 0; i < xfr->num_iso_packets; i++) {
735                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
736
737                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
738                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
739                         continue;
740 //exit(5);
741                 }
742
743                 const uint8_t *start = xfr->buffer + offset;
744                 const uint8_t *limit = start + pack->actual_length;
745                 while (start < limit) {  // Usually runs only one iteration.
746                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
747                         if (start == limit) break;
748                         assert(start < limit);
749
750                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
751                         if (start_next_frame == nullptr) {
752                                 // add the rest of the buffer
753                                 add_to_frame(current_frame, frame_type_name, start, limit);
754                                 break;
755                         } else {
756                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
757                                 start = start_next_frame + sync_length;  // skip sync
758                                 start_callback(start);
759                         }
760                 }
761 #if 0
762                 dump_pack(xfr, offset, pack);
763 #endif
764                 offset += pack->length;
765         }
766 }
767
768 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
769 {
770         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
771             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
772                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
773                 libusb_free_transfer(xfr);
774                 exit(3);
775         }
776
777         assert(xfr->user_data != nullptr);
778         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
779
780         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
781                 if (!usb->disconnected) {
782                         fprintf(stderr, "Device went away, stopping transfers.\n");
783                         usb->disconnected = true;
784                         if (usb->card_disconnected_callback) {
785                                 usb->card_disconnected_callback();
786                         }
787                 }
788                 // Don't reschedule the transfer; the loop will stop by itself.
789                 return;
790         }
791
792         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
793                 if (xfr->endpoint == 0x84) {
794                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
795                 } else {
796                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
797
798                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
799                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
800                 }
801         }
802         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
803                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
804                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
805 #if 0
806                 if (setup->wIndex == 44) {
807                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
808                 } else {
809                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
810                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
811                 }
812 #else
813                 memcpy(usb->register_file + usb->current_register, buf, 4);
814                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
815                 if (usb->current_register == 0) {
816                         // read through all of them
817                         printf("register dump:");
818                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
819                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
820                         }
821                         printf("\n");
822                 }
823                 libusb_fill_control_setup(xfr->buffer,
824                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
825                         /*index=*/usb->current_register, /*length=*/4);
826 #endif
827         }
828
829 #if 0
830         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
831         for (i = 0; i < xfr->actual_length; i++) {
832                 printf("%02x", xfr->buffer[i]);
833                 if (i % 16)
834                         printf("\n");
835                 else if (i % 8)
836                         printf("  ");
837                 else
838                         printf(" ");
839         }
840 #endif
841
842         int rc = libusb_submit_transfer(xfr);
843         if (rc < 0) {
844                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
845                 exit(1);
846         }
847 }
848
849 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
850 {
851         if (card_connected_callback != nullptr) {
852                 libusb_device_descriptor desc;
853                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
854                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
855                         libusb_unref_device(dev);
856                         return 1;
857                 }
858
859                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
860                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
861                         card_connected_callback(dev);  // Callback takes ownership.
862                         return 0;
863                 }
864         }
865         libusb_unref_device(dev);
866         return 0;
867 }
868
869 void BMUSBCapture::usb_thread_func()
870 {
871         sched_param param;
872         memset(&param, 0, sizeof(param));
873         param.sched_priority = 1;
874         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
875                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
876         }
877         while (!should_quit) {
878                 int rc = libusb_handle_events(nullptr);
879                 if (rc != LIBUSB_SUCCESS)
880                         break;
881         }
882 }
883
884 struct USBCardDevice {
885         uint16_t product;
886         uint8_t bus, port;
887         libusb_device *device;
888 };
889
// Maps a supported USB product ID to its human-readable product name.
// Unknown IDs trip an assertion; if assertions are compiled out, nullptr
// is returned instead.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
901
902 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
903 {
904         const char *product_name = get_product_name(product);
905
906         char buf[256];
907         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
908                 id, bus, port, product_name);
909         return buf;
910 }
911
912 libusb_device_handle *open_card(int card_index, string *description)
913 {
914         libusb_device **devices;
915         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
916         if (num_devices == -1) {
917                 fprintf(stderr, "Error finding USB devices\n");
918                 exit(1);
919         }
920         vector<USBCardDevice> found_cards;
921         for (ssize_t i = 0; i < num_devices; ++i) {
922                 libusb_device_descriptor desc;
923                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
924                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
925                         exit(1);
926                 }
927
928                 uint8_t bus = libusb_get_bus_number(devices[i]);
929                 uint8_t port = libusb_get_port_number(devices[i]);
930
931                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
932                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
933                         libusb_unref_device(devices[i]);
934                         continue;
935                 }
936
937                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
938         }
939         libusb_free_device_list(devices, 0);
940
941         // Sort the devices to get a consistent ordering.
942         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
943                 if (a.product != b.product)
944                         return a.product < b.product;
945                 if (a.bus != b.bus)
946                         return a.bus < b.bus;
947                 return a.port < b.port;
948         });
949
950         for (size_t i = 0; i < found_cards.size(); ++i) {
951                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
952                 fprintf(stderr, "%s\n", tmp_description.c_str());
953                 if (i == size_t(card_index)) {
954                         *description = tmp_description;
955                 }
956         }
957
958         if (size_t(card_index) >= found_cards.size()) {
959                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
960                 exit(1);
961         }
962
963         libusb_device_handle *devh;
964         int rc = libusb_open(found_cards[card_index].device, &devh);
965         if (rc < 0) {
966                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
967                 exit(1);
968         }
969
970         for (size_t i = 0; i < found_cards.size(); ++i) {
971                 libusb_unref_device(found_cards[i].device);
972         }
973
974         return devh;
975 }
976
977 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
978 {
979         uint8_t bus = libusb_get_bus_number(dev);
980         uint8_t port = libusb_get_port_number(dev);
981
982         libusb_device_descriptor desc;
983         if (libusb_get_device_descriptor(dev, &desc) < 0) {
984                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
985                 exit(1);
986         }
987
988         *description = get_card_description(card_index, bus, port, desc.idProduct);
989
990         libusb_device_handle *devh;
991         int rc = libusb_open(dev, &devh);
992         if (rc < 0) {
993                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
994                 exit(1);
995         }
996
997         return devh;
998 }
999
1000 void BMUSBCapture::configure_card()
1001 {
1002         if (video_frame_allocator == nullptr) {
1003                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1004                 set_video_frame_allocator(owned_video_frame_allocator.get());
1005         }
1006         if (audio_frame_allocator == nullptr) {
1007                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1008                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1009         }
1010         dequeue_thread_should_quit = false;
1011         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1012
1013         int rc;
1014         struct libusb_transfer *xfr;
1015
1016         rc = libusb_init(nullptr);
1017         if (rc < 0) {
1018                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1019                 exit(1);
1020         }
1021
1022         if (dev == nullptr) {
1023                 devh = open_card(card_index, &description);
1024         } else {
1025                 devh = open_card(card_index, dev, &description);
1026                 libusb_unref_device(dev);
1027         }
1028         if (!devh) {
1029                 fprintf(stderr, "Error finding USB device\n");
1030                 exit(1);
1031         }
1032
1033         libusb_config_descriptor *config;
1034         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1035         if (rc < 0) {
1036                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1037                 exit(1);
1038         }
1039
1040 #if 0
1041         printf("%d interface\n", config->bNumInterfaces);
1042         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1043                 printf("  interface %d\n", interface_number);
1044                 const libusb_interface *interface = &config->interface[interface_number];
1045                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1046                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1047                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1048                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1049                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1050                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1051                         }
1052                 }
1053         }
1054 #endif
1055
1056         rc = libusb_set_configuration(devh, /*configuration=*/1);
1057         if (rc < 0) {
1058                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1059                 exit(1);
1060         }
1061
1062         rc = libusb_claim_interface(devh, 0);
1063         if (rc < 0) {
1064                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1065                 exit(1);
1066         }
1067
1068         // Alternate setting 1 is output, alternate setting 2 is input.
1069         // Card is reset when switching alternates, so the driver uses
1070         // this “double switch” when it wants to reset.
1071         //
1072         // There's also alternate settings 3 and 4, which seem to be
1073         // like 1 and 2 except they advertise less bandwidth needed.
1074         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1075         if (rc < 0) {
1076                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1077                 exit(1);
1078         }
1079         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1080         if (rc < 0) {
1081                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1082                 exit(1);
1083         }
1084 #if 0
1085         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1086         if (rc < 0) {
1087                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1088                 exit(1);
1089         }
1090 #endif
1091
1092 #if 0
1093         rc = libusb_claim_interface(devh, 3);
1094         if (rc < 0) {
1095                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1096                 exit(1);
1097         }
1098 #endif
1099
1100         // theories:
1101         //   44 is some kind of timer register (first 16 bits count upwards)
1102         //   24 is some sort of watchdog?
1103         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1104         //      (or will go to 0x73c60010?), also seen 0x73c60100
1105         //   12 also changes all the time, unclear why  
1106         //   16 seems to be autodetected mode somehow
1107         //      --    this is e00115e0 after reset?
1108         //                    ed0115e0 after mode change [to output?]
1109         //                    2d0015e0 after more mode change [to input]
1110         //                    ed0115e0 after more mode change
1111         //                    2d0015e0 after more mode change
1112         //
1113         //                    390115e0 seems to indicate we have signal
1114         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1115         //
1116         //                    200015e0 on startup
1117         //         changes to 250115e0 when we sync to the signal
1118         //
1119         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1120         //
1121         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1122         //
1123         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1124         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1125         //
1126         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1127         //    perhaps some of them are related to analog output?
1128         //
1129         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1130         //    but the driver sets it to 0x8036802a at some point.
1131         //
1132         //    all of this is on request 214/215. other requests (192, 219,
1133         //    222, 223, 224) are used for firmware upgrade. Probably best to
1134         //    stay out of it unless you know what you're doing.
1135         //
1136         //
1137         // register 16:
1138         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1139         //
1140         // theories:
1141         //   0x01 - stable signal
1142         //   0x04 - deep color
1143         //   0x08 - unknown (audio??)
1144         //   0x20 - 720p??
1145         //   0x30 - 576p??
1146
1147         update_capture_mode();
1148
1149         struct ctrl {
1150                 int endpoint;
1151                 int request;
1152                 int index;
1153                 uint32_t data;
1154         };
1155         static const ctrl ctrls[] = {
1156                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1157                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1158
1159                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1160                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1161                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1162                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1163         };
1164
1165         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1166                 uint32_t flipped = htonl(ctrls[req].data);
1167                 static uint8_t value[4];
1168                 memcpy(value, &flipped, sizeof(flipped));
1169                 int size = sizeof(value);
1170                 //if (ctrls[req].request == 215) size = 0;
1171                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1172                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1173                 if (rc < 0) {
1174                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1175                         exit(1);
1176                 }
1177
1178                 if (ctrls[req].index == 16 && rc == 4) {
1179                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1180                 }
1181
1182 #if 0
1183                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1184                 for (int i = 0; i < rc; ++i) {
1185                         printf("%02x", value[i]);
1186                 }
1187                 printf("\n");
1188 #endif
1189         }
1190
1191 #if 0
1192         // DEBUG
1193         for ( ;; ) {
1194                 static int my_index = 0;
1195                 static uint8_t value[4];
1196                 int size = sizeof(value);
1197                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1198                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1199                 if (rc < 0) {
1200                         fprintf(stderr, "Error on control\n");
1201                         exit(1);
1202                 }
1203                 printf("rc=%d index=%d: 0x", rc, my_index);
1204                 for (int i = 0; i < rc; ++i) {
1205                         printf("%02x", value[i]);
1206                 }
1207                 printf("\n");
1208         }
1209 #endif
1210
1211 #if 0
1212         // set up an asynchronous transfer of the timer register
1213         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1214         static int completed = 0;
1215
1216         xfr = libusb_alloc_transfer(0);
1217         libusb_fill_control_setup(cmdbuf,
1218             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1219                 /*index=*/44, /*length=*/4);
1220         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1221         xfr->user_data = this;
1222         libusb_submit_transfer(xfr);
1223
1224         // set up an asynchronous transfer of register 24
1225         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1226         static int completed2 = 0;
1227
1228         xfr = libusb_alloc_transfer(0);
1229         libusb_fill_control_setup(cmdbuf2,
1230             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1231                 /*index=*/24, /*length=*/4);
1232         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1233         xfr->user_data = this;
1234         libusb_submit_transfer(xfr);
1235 #endif
1236
1237         // set up an asynchronous transfer of the register dump
1238         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1239         static int completed3 = 0;
1240
1241         xfr = libusb_alloc_transfer(0);
1242         libusb_fill_control_setup(cmdbuf3,
1243             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1244                 /*index=*/current_register, /*length=*/4);
1245         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1246         xfr->user_data = this;
1247         //libusb_submit_transfer(xfr);
1248
1249         //audiofp = fopen("audio.raw", "wb");
1250
1251         // set up isochronous transfers for audio and video
1252         for (int e = 3; e <= 4; ++e) {
1253                 //int num_transfers = (e == 3) ? 6 : 6;
1254                 int num_transfers = 6;
1255                 for (int i = 0; i < num_transfers; ++i) {
1256                         size_t buf_size;
1257                         int num_iso_pack, size;
1258                         if (e == 3) {
1259                                 // Allocate for minimum width (because that will give us the most
1260                                 // number of packets, so we don't need to reallocated, but we'll
1261                                 // default to 720p for the first frame.
1262                                 size = find_xfer_size_for_width(MIN_WIDTH);
1263                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1264                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1265                         } else {
1266                                 size = 0xc0;
1267                                 num_iso_pack = 80;
1268                                 buf_size = num_iso_pack * size;
1269                         }
1270                         int num_bytes = num_iso_pack * size;
1271                         assert(size_t(num_bytes) <= buf_size);
1272 #if LIBUSB_API_VERSION >= 0x01000105
1273                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1274 #else
1275                         uint8_t *buf = nullptr;
1276 #endif
1277                         if (buf == nullptr) {
1278                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1279 #if LIBUSB_API_VERSION >= 0x01000105
1280                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1281 #else
1282                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1283 #endif
1284                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1285                                 buf = new uint8_t[num_bytes];
1286                         }
1287
1288                         xfr = libusb_alloc_transfer(num_iso_pack);
1289                         if (!xfr) {
1290                                 fprintf(stderr, "oom\n");
1291                                 exit(1);
1292                         }
1293
1294                         int ep = LIBUSB_ENDPOINT_IN | e;
1295                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1296                                 num_iso_pack, cb_xfr, nullptr, 0);
1297                         libusb_set_iso_packet_lengths(xfr, size);
1298                         xfr->user_data = this;
1299
1300                         if (e == 3) {
1301                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1302                         }
1303
1304                         iso_xfrs.push_back(xfr);
1305                 }
1306         }
1307 }
1308
1309 void BMUSBCapture::start_bm_capture()
1310 {
1311         int i = 0;
1312         for (libusb_transfer *xfr : iso_xfrs) {
1313                 int rc = libusb_submit_transfer(xfr);
1314                 ++i;
1315                 if (rc < 0) {
1316                         //printf("num_bytes=%d\n", num_bytes);
1317                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1318                                 xfr->endpoint, i, libusb_error_name(rc));
1319                         exit(1);
1320                 }
1321         }
1322
1323
1324 #if 0
1325         libusb_release_interface(devh, 0);
1326 out:
1327         if (devh)
1328                 libusb_close(devh);
1329         libusb_exit(nullptr);
1330         return rc;
1331 #endif
1332 }
1333
1334 void BMUSBCapture::stop_dequeue_thread()
1335 {
1336         dequeue_thread_should_quit = true;
1337         queues_not_empty.notify_all();
1338         dequeue_thread.join();
1339 }
1340
1341 void BMUSBCapture::start_bm_thread()
1342 {
1343         // Devices leaving are discovered by seeing the isochronous packets
1344         // coming back with errors, so only care about devices joining.
1345         if (card_connected_callback != nullptr) {
1346                 if (libusb_hotplug_register_callback(
1347                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, LIBUSB_HOTPLUG_NO_FLAGS,
1348                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1349                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1350                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1351                         exit(1);
1352                 }
1353         }
1354
1355         should_quit = false;
1356         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1357 }
1358
1359 void BMUSBCapture::stop_bm_thread()
1360 {
1361         should_quit = true;
1362         usb_thread.join();
1363 }
1364
1365 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1366 {
1367         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1368         VideoMode auto_mode;
1369         auto_mode.name = "Autodetect";
1370         auto_mode.autodetect = true;
1371         return {{ 0, auto_mode }};
1372 }
1373
1374 uint32_t BMUSBCapture::get_current_video_mode() const
1375 {
1376         return 0;  // Matches get_available_video_modes().
1377 }
1378
1379 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1380 {
1381         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1382 }
1383
1384 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1385 {
1386         return {
1387                 { 0x00000000, "HDMI/SDI" },
1388                 { 0x02000000, "Component" },
1389                 { 0x04000000, "Composite" },
1390                 { 0x06000000, "S-video" }
1391         };
1392 }
1393
1394 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1395 {
1396         assert((video_input_id & ~0x06000000) == 0);
1397         current_video_input = video_input_id;
1398         update_capture_mode();
1399 }
1400
1401 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1402 {
1403         return {
1404                 { 0x00000000, "Embedded" },
1405                 { 0x10000000, "Analog" }
1406         };
1407 }
1408
1409 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1410 {
1411         assert((audio_input_id & ~0x10000000) == 0);
1412         current_audio_input = audio_input_id;
1413         update_capture_mode();
1414 }
1415
1416 void BMUSBCapture::update_capture_mode()
1417 {
1418         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1419         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1420         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1421
1422         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1423                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1424         if (rc < 0) {
1425                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1426                 exit(1);
1427         }
1428 }