]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Fix a frame leak (leading to freeze) when getting an unknown resolution.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.6.0
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
// Definitions of BMUSBCapture's static data members. Both start out in
// their "off" state: no card-connected callback is registered, and
// hotplug_existing_devices is false.
card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
// Output stream that dump_audio_block() writes to. Nothing in the visible
// part of this file opens it.
// NOTE(review): confirm where (if anywhere) this gets assigned.
FILE *audiofp;

// NOTE(review): presumably the thread servicing USB events and the flag
// asking it to stop -- confirm against the code that starts and stops it.
thread usb_thread;
atomic<bool> should_quit;
65
// Returns the size in bytes of one line of v210-style packed video that is
// "width" pixels wide. Pixels are counted in groups of six (rounding up),
// and every group takes four 32-bit words.
int v210_stride(int width)
{
        const int pixel_groups = (width + 5) / 6;
        return pixel_groups * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row of the lookup table used by decode_video_format(). Member order
// matters, since the table is written with positional aggregate
// initialization.
struct VideoFormatEntry {
        // Format code after decode_video_format() has masked out the flag bits.
        uint16_t normalized_video_format;

        // Picture dimensions, and the value copied to second_field_start
        // (zero for the non-interlaced rows of the table).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Values copied to extra_lines_top / extra_lines_bottom.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as the fraction frame_rate_nom / frame_rate_den.
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
// Get details for the given video format; returns false if detection was incomplete.
//
// Every field of *decoded_video_format that this function knows about is
// filled in on all paths, so the struct is usable (with fallback values)
// even when false is returned:
//
//  - 0x0800 is treated as "no signal": 720x525 at 30.13 fps, has_signal = false,
//    returns true.
//  - Anything without all of the bits in 0xe000 set is reported as not being a
//    video format at all: zero size, 60 fps, has_signal = false, returns false.
//  - NTSC and PAL are matched explicitly (ignoring the 0x0800 bit).
//  - Everything else is looked up in a table after masking out 0xe80c.
//  - If nothing matches, 1280x720 at 60 fps is assumed (has_signal stays true)
//    and false is returned.
//
// Where the 0x0800 bit is consulted, set means a stride of two bytes per
// pixel and clear means the v210 stride.
bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
{
        decoded_video_format->id = video_format;
        decoded_video_format->interlaced = false;

        // TODO: Add these for all formats as we find them.
        decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;

        if (video_format == 0x0800) {
                // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
                // It's a strange thing, but what can you do.
                decoded_video_format->width = 720;
                decoded_video_format->height = 525;
                decoded_video_format->stride = 720 * 2;
                decoded_video_format->extra_lines_top = 0;
                decoded_video_format->extra_lines_bottom = 0;
                decoded_video_format->frame_rate_nom = 3013;
                decoded_video_format->frame_rate_den = 100;
                decoded_video_format->has_signal = false;
                return true;
        }
        if ((video_format & 0xe000) != 0xe000) {
                // Not something we recognize as a format code at all; report an
                // empty picture, but still give a nonzero frame rate.
                printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
                        video_format);
                decoded_video_format->width = 0;
                decoded_video_format->height = 0;
                decoded_video_format->stride = 0;
                decoded_video_format->extra_lines_top = 0;
                decoded_video_format->extra_lines_bottom = 0;
                decoded_video_format->frame_rate_nom = 60;
                decoded_video_format->frame_rate_den = 1;
                decoded_video_format->has_signal = false;
                return false;
        }

        // From here on, every path (including the unknown-format fallback
        // at the bottom) reports a signal.
        decoded_video_format->has_signal = true;

        // NTSC (480i59.94, I suppose). A special case, see below.
        if ((video_format & ~0x0800) == 0xe101 ||
            (video_format & ~0x0800) == 0xe1c1 ||
            (video_format & ~0x0800) == 0xe001) {
                decoded_video_format->width = 720;
                decoded_video_format->height = 480;
                if (video_format & 0x0800) {
                        decoded_video_format->stride = 720 * 2;
                } else {
                        decoded_video_format->stride = v210_stride(720);
                }
                decoded_video_format->extra_lines_top = 17;
                decoded_video_format->extra_lines_bottom = 28;
                decoded_video_format->frame_rate_nom = 30000;
                decoded_video_format->frame_rate_den = 1001;
                decoded_video_format->second_field_start = 280;
                decoded_video_format->interlaced = true;
                return true;
        }

        // PAL (576i50, I suppose). A special case, see below.
        if ((video_format & ~0x0800) == 0xe109 ||
            (video_format & ~0x0800) == 0xe1c9 ||
            (video_format & ~0x0800) == 0xe009 ||
            (video_format & ~0x0800) == 0xe3e9 ||
            (video_format & ~0x0800) == 0xe3e1) {
                decoded_video_format->width = 720;
                decoded_video_format->height = 576;
                if (video_format & 0x0800) {
                        decoded_video_format->stride = 720 * 2;
                } else {
                        decoded_video_format->stride = v210_stride(720);
                }
                decoded_video_format->extra_lines_top = 22;
                decoded_video_format->extra_lines_bottom = 27;
                decoded_video_format->frame_rate_nom = 25;
                decoded_video_format->frame_rate_den = 1;
                decoded_video_format->second_field_start = 335;
                decoded_video_format->interlaced = true;
                return true;
        }

        // 0x8 seems to be a flag about availability of deep color on the input,
        // except when it's not (e.g. it's the only difference between NTSC
        // and PAL). Rather confusing. But we clear it here nevertheless, because
        // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
        // (as opposed to 10-bit).
        //
        // 0x4 is a flag I've only seen from the D4. I don't know what it is.
        //
        // The mask 0xe80c removes the 0xe000 marker bits checked above, the
        // 0x0800 bit, and the 0x8 and 0x4 flags in one go.
        uint16_t normalized_video_format = video_format & ~0xe80c;
        constexpr VideoFormatEntry entries[] = {
                { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
                { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
                { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
                { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
                { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
                { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
                { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
                { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
                { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
                { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
                { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
                { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
                { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
                { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
                { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
        };
        for (const VideoFormatEntry &entry : entries) {
                if (normalized_video_format == entry.normalized_video_format) {
                        decoded_video_format->width = entry.width;
                        decoded_video_format->height = entry.height;
                        if (video_format & 0x0800) {
                                decoded_video_format->stride = entry.width * 2;
                        } else {
                                decoded_video_format->stride = v210_stride(entry.width);
                        }
                        decoded_video_format->second_field_start = entry.second_field_start;
                        decoded_video_format->extra_lines_top = entry.extra_lines_top;
                        decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
                        decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
                        decoded_video_format->frame_rate_den = entry.frame_rate_den;
                        decoded_video_format->interlaced = entry.interlaced;
                        return true;
                }
        }

        // Nothing matched. Fill in a guess so that the caller gets sane values,
        // but signal through the return value that it is only a guess. Note that
        // the stride here is always the two-bytes-per-pixel one, regardless of
        // the 0x0800 bit.
        printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
        decoded_video_format->width = 1280;
        decoded_video_format->height = 720;
        decoded_video_format->stride = 1280 * 2;
        decoded_video_format->frame_rate_nom = 60;
        decoded_video_format->frame_rate_den = 1;
        return false;
}
243
244 // There are seemingly no direct indicators of sample rate; you just get
245 // one frame's worth and have to guess from that.
246 int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
247 {
248         size_t num_samples = len / 3 / 8;
249         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
250
251         // See if we match or are very close to any of the mandatory HDMI sample rates.
252         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
253         for (int rate : candidate_sample_rates) {
254                 if (abs(int(num_samples_per_second) - rate) < 50) {
255                         return rate;
256                 }
257         }
258
259         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, keeping capture at %d Hz\n",
260                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
261         return default_rate;
262 }
263
264 }  // namespace
265
266 FrameAllocator::~FrameAllocator() {}
267
268 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
269         : frame_size(frame_size)
270 {
271         for (size_t i = 0; i < num_queued_frames; ++i) {
272                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
273         }
274 }
275
276 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
277 {
278         Frame vf;
279         vf.owner = this;
280
281         unique_lock<mutex> lock(freelist_mutex);  // Meh.
282         if (freelist.empty()) {
283                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
284                         frame_size);
285         } else {
286                 vf.data = freelist.top().release();
287                 vf.size = frame_size;
288                 freelist.pop();  // Meh.
289         }
290         return vf;
291 }
292
293 void MallocFrameAllocator::release_frame(Frame frame)
294 {
295         if (frame.overflow > 0) {
296                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
297         }
298         unique_lock<mutex> lock(freelist_mutex);
299         freelist.push(unique_ptr<uint8_t[]>(frame.data));
300 }
301
// Compares two 16-bit counters that are allowed to wrap around.
// Returns true iff "b" lies strictly ahead of "a" by fewer than 0x8000
// steps, counting modulo 65536. Equal values, and values exactly 0x8000
// apart, compare as false in both directions.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // The truncating cast performs the modulo-65536 reduction.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
313
314 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
315 {
316         unique_lock<mutex> lock(queue_lock);
317         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
318                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
319                         q->back().timecode, timecode);
320                 frame.owner->release_frame(frame);
321                 return;
322         }
323
324         QueuedFrame qf;
325         qf.format = format;
326         qf.timecode = timecode;
327         qf.frame = frame;
328         q->push_back(move(qf));
329         queues_not_empty.notify_one();  // might be spurious
330 }
331
332 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
333 {
334         FILE *fp = fopen(filename, "wb");
335         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
336                 printf("short write!\n");
337         }
338         fclose(fp);
339 }
340
341 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
342 {
343         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
344 }
345
346 void BMUSBCapture::dequeue_thread_func()
347 {
348         char thread_name[16];
349         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
350         pthread_setname_np(pthread_self(), thread_name);
351
352         if (has_dequeue_callbacks) {
353                 dequeue_init_callback();
354         }
355         size_t last_sample_rate = 48000;
356         while (!dequeue_thread_should_quit) {
357                 unique_lock<mutex> lock(queue_lock);
358                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
359
360                 if (dequeue_thread_should_quit) break;
361
362                 uint16_t video_timecode = pending_video_frames.front().timecode;
363                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
364                 AudioFormat audio_format;
365                 audio_format.bits_per_sample = 24;
366                 audio_format.num_channels = 8;
367                 audio_format.sample_rate = last_sample_rate;
368                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
369                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
370                                 video_timecode);
371                         QueuedFrame video_frame = pending_video_frames.front();
372                         pending_video_frames.pop_front();
373                         lock.unlock();
374                         video_frame_allocator->release_frame(video_frame.frame);
375                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
376                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
377                                 audio_timecode);
378                         QueuedFrame audio_frame = pending_audio_frames.front();
379                         pending_audio_frames.pop_front();
380                         lock.unlock();
381                         audio_format.id = audio_frame.format;
382
383                         // Use the video format of the pending frame.
384                         QueuedFrame video_frame = pending_video_frames.front();
385                         VideoFormat video_format;
386                         decode_video_format(video_frame.format, &video_format);
387
388                         frame_callback(audio_timecode,
389                                        FrameAllocator::Frame(), 0, video_format,
390                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
391                 } else {
392                         QueuedFrame video_frame = pending_video_frames.front();
393                         QueuedFrame audio_frame = pending_audio_frames.front();
394                         pending_audio_frames.pop_front();
395                         pending_video_frames.pop_front();
396                         lock.unlock();
397
398 #if 0
399                         char filename[255];
400                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
401                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
402                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
403 #endif
404
405                         VideoFormat video_format;
406                         audio_format.id = audio_frame.format;
407                         if (decode_video_format(video_frame.format, &video_format)) {
408                                 if (audio_frame.frame.len != 0) {
409                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
410                                         last_sample_rate = audio_format.sample_rate;
411                                 }
412                                 frame_callback(video_timecode,
413                                                video_frame.frame, HEADER_SIZE, video_format,
414                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
415                         } else {
416                                 video_frame_allocator->release_frame(video_frame.frame);
417                                 audio_format.sample_rate = last_sample_rate;
418                                 frame_callback(video_timecode,
419                                                FrameAllocator::Frame(), 0, video_format,
420                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
421                         }
422                 }
423         }
424         if (has_dequeue_callbacks) {
425                 dequeue_cleanup_callback();
426         }
427 }
428
429 void BMUSBCapture::start_new_frame(const uint8_t *start)
430 {
431         uint16_t format = (start[3] << 8) | start[2];
432         uint16_t timecode = (start[1] << 8) | start[0];
433
434         if (current_video_frame.len > 0) {
435                 current_video_frame.received_timestamp = steady_clock::now();
436
437                 // If format is 0x0800 (no signal), add a fake (empty) audio
438                 // frame to get it out of the queue.
439                 // TODO: Figure out if there are other formats that come with
440                 // no audio, and treat them the same.
441                 if (format == 0x0800) {
442                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
443                         if (fake_audio_frame.data == nullptr) {
444                                 // Oh well, it's just a no-signal frame anyway.
445                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
446                                 current_video_frame.owner->release_frame(current_video_frame);
447                                 current_video_frame = video_frame_allocator->alloc_frame();
448                                 return;
449                         }
450                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
451                 }
452                 //dump_frame();
453                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
454
455                 // Update the assumed frame width. We might be one frame too late on format changes,
456                 // but it's much better than asking the user to choose manually.
457                 VideoFormat video_format;
458                 if (decode_video_format(format, &video_format)) {
459                         assumed_frame_width = video_format.width;
460                 }
461         }
462         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
463         //      format, timecode,
464         //      //start[7], start[6], start[5], start[4],
465         //      read_current_frame, FRAME_SIZE);
466
467         current_video_frame = video_frame_allocator->alloc_frame();
468         //if (current_video_frame.data == nullptr) {
469         //      read_current_frame = -1;
470         //} else {
471         //      read_current_frame = 0;
472         //}
473 }
474
475 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
476 {
477         uint16_t format = (start[3] << 8) | start[2];
478         uint16_t timecode = (start[1] << 8) | start[0];
479         if (current_audio_frame.len > 0) {
480                 current_audio_frame.received_timestamp = steady_clock::now();
481                 //dump_audio_block();
482                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
483         }
484         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
485         //      format, timecode);
486         current_audio_frame = audio_frame_allocator->alloc_frame();
487 }
488
#if 0
// Debugging aid (compiled out): hex-dumps the received bytes of one
// isochronous packet. "offset" is where the packet's data starts within
// xfr->buffer. Output is 16 bytes per line, with an extra space after
// the eighth byte of each line.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        for (unsigned j = 0; j < pack->actual_length; j++) {
        //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
                printf("%02x", xfr->buffer[j + offset]);
                if ((j % 16) == 15)
                        printf("\n");
                else if ((j % 8) == 7)
                        printf("  ");
                else
                        printf(" ");
        }
}
#endif
505
// De-interleaves "n" bytes from "src": bytes at even offsets are copied
// to "dest1" and bytes at odd offsets to "dest2", so each destination
// receives n/2 bytes. "n" must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
517
518 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
519 {
520         if (current_frame->data == nullptr ||
521             current_frame->len > current_frame->size ||
522             start == end) {
523                 return;
524         }
525
526         int bytes = end - start;
527         if (current_frame->len + bytes > current_frame->size) {
528                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
529                 current_frame->len = current_frame->size;
530                 if (current_frame->overflow > 1048576) {
531                         printf("%d bytes overflow after last %s frame\n",
532                                 int(current_frame->overflow), frame_type_name);
533                         current_frame->overflow = 0;
534                 }
535                 //dump_frame();
536         } else {
537                 if (current_frame->interleaved) {
538                         uint8_t *data = current_frame->data + current_frame->len / 2;
539                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
540                         if (current_frame->len % 2 == 1) {
541                                 ++data;
542                                 swap(data, data2);
543                         }
544                         if (bytes % 2 == 1) {
545                                 *data++ = *start++;
546                                 swap(data, data2);
547                                 ++current_frame->len;
548                                 --bytes;
549                         }
550                         memcpy_interleaved(data, data2, start, bytes);
551                         current_frame->len += bytes;
552                 } else {
553                         memcpy(current_frame->data + current_frame->len, start, bytes);
554                         current_frame->len += bytes;
555                 }
556         }
557 }
558
#if 0
// Debugging aid (compiled out): prints the 32 bytes of the AVX2 register
// "n" in hex, lowest byte first, in four groups of eight, prefixed by
// "name". The accesses are written out one by one, each with a literal
// index.
void avx2_dump(const char *name, __m256i n)
{
        printf("%-10s:", name);
        printf(" %02x", _mm256_extract_epi8(n, 0));
        printf(" %02x", _mm256_extract_epi8(n, 1));
        printf(" %02x", _mm256_extract_epi8(n, 2));
        printf(" %02x", _mm256_extract_epi8(n, 3));
        printf(" %02x", _mm256_extract_epi8(n, 4));
        printf(" %02x", _mm256_extract_epi8(n, 5));
        printf(" %02x", _mm256_extract_epi8(n, 6));
        printf(" %02x", _mm256_extract_epi8(n, 7));
        printf(" ");
        printf(" %02x", _mm256_extract_epi8(n, 8));
        printf(" %02x", _mm256_extract_epi8(n, 9));
        printf(" %02x", _mm256_extract_epi8(n, 10));
        printf(" %02x", _mm256_extract_epi8(n, 11));
        printf(" %02x", _mm256_extract_epi8(n, 12));
        printf(" %02x", _mm256_extract_epi8(n, 13));
        printf(" %02x", _mm256_extract_epi8(n, 14));
        printf(" %02x", _mm256_extract_epi8(n, 15));
        printf(" ");
        printf(" %02x", _mm256_extract_epi8(n, 16));
        printf(" %02x", _mm256_extract_epi8(n, 17));
        printf(" %02x", _mm256_extract_epi8(n, 18));
        printf(" %02x", _mm256_extract_epi8(n, 19));
        printf(" %02x", _mm256_extract_epi8(n, 20));
        printf(" %02x", _mm256_extract_epi8(n, 21));
        printf(" %02x", _mm256_extract_epi8(n, 22));
        printf(" %02x", _mm256_extract_epi8(n, 23));
        printf(" ");
        printf(" %02x", _mm256_extract_epi8(n, 24));
        printf(" %02x", _mm256_extract_epi8(n, 25));
        printf(" %02x", _mm256_extract_epi8(n, 26));
        printf(" %02x", _mm256_extract_epi8(n, 27));
        printf(" %02x", _mm256_extract_epi8(n, 28));
        printf(" %02x", _mm256_extract_epi8(n, 29));
        printf(" %02x", _mm256_extract_epi8(n, 30));
        printf(" %02x", _mm256_extract_epi8(n, 31));
        printf("\n");
}
#endif
601
602 #ifndef HAS_MULTIVERSIONING
603
// Fallback used when HAS_MULTIVERSIONING is not defined. It consumes
// nothing: "start" is returned unchanged and the frame is not touched.
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        // No fast path possible unless we have multiversioning.
        return start;
}
609
610 #else  // defined(HAS_MULTIVERSIONING)
611
// Forward declarations of the per-instruction-set inner loops of the fast
// path: one version for the "sse4.1" target and one for "avx2", both
// under the same name and signature.
__attribute__((target("sse4.1")))
const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);

__attribute__((target("avx2")))
const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
617
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data
// from main memory.
//
// Complicated cases are left to the slow path; it basically stops copying
// up until the first instance of "sync_char" (usually a bit before, actually).
// This is fine, since 0x00 bytes shouldn't really show up in normal picture
// data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// The description above applies to the SSE 4.1/AVX2 version of this
// function. This one is the "default" target version; it does no work at
// all and returns "start" unchanged.
__attribute__((target("default")))
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        // No fast path possible unless we have SSE 4.1 or higher.
        return start;
}
633
// SSE 4.1/AVX2 version of the fast path. Copies as much of [start, limit)
// into "current_frame" as it can handle and returns a pointer into that
// range; "start" itself is returned if the fast path was not used at all.
//
// This function only does the setup: it bails out on unusable frames and
// short inputs, trims the range so that it fits in the frame and ends on a
// 32-byte boundary, copies the unaligned head through add_to_frame()
// (stopping early if "sync_char" shows up there), and then hands the
// aligned remainder to add_to_frame_fastpath_core().
__attribute__((target("sse4.1", "avx2")))
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        if (current_frame->data == nullptr ||
            current_frame->len > current_frame->size ||
            start == limit) {
                return start;
        }
        size_t orig_bytes = limit - start;
        if (orig_bytes < 128) {
                // Don't bother.
                return start;
        }

        // Don't read more bytes than we can write.
        limit = min(limit, start + (current_frame->size - current_frame->len));

        // Align end to 32 bytes.
        limit = (const uint8_t *)(intptr_t(limit) & ~31);

        // Clamping and rounding down may have left nothing to process.
        if (start >= limit) {
                return start;
        }

        // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
        if (aligned_start != start) {
                const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
                if (sync_start == nullptr) {
                        add_to_frame(current_frame, "", start, aligned_start);
                } else {
                        // Found sync_char in the head: copy what precedes it and
                        // stop there.
                        add_to_frame(current_frame, "", start, sync_start);
                        return sync_start;
                }
        }

        // Make the length a multiple of 64.
        // (Both ends are 32-byte aligned at this point, so dropping one
        // 32-byte chunk from the end is enough.)
        if (current_frame->interleaved) {
                if (((limit - aligned_start) % 64) != 0) {
                        limit -= 32;
                }
                assert(((limit - aligned_start) % 64) == 0);
        }

        return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
}
680
681 __attribute__((target("avx2")))
682 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
683 {
684         const __m256i needle = _mm256_set1_epi8(sync_char);
685
686         const __restrict __m256i *in = (const __m256i *)aligned_start;
687         if (current_frame->interleaved) {
688                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
689                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
690                 if (current_frame->len % 2 == 1) {
691                         swap(out1, out2);
692                 }
693
694                 __m256i shuffle_cw = _mm256_set_epi8(
695                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
696                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
697                 while (in < (const __m256i *)limit) {
698                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
699                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
700                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
701
702                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
703                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
704                         __m256i found = _mm256_or_si256(found1, found2);
705
706                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
707                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
708                 
709                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
710                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
711
712                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
713                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
714
715                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
716                         _mm256_storeu_si256(out2, hi);
717
718                         if (!_mm256_testz_si256(found, found)) {
719                                 break;
720                         }
721
722                         in += 2;
723                         ++out1;
724                         ++out2;
725                 }
726                 current_frame->len += (uint8_t *)in - aligned_start;
727         } else {
728                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
729                 while (in < (const __m256i *)limit) {
730                         __m256i data = _mm256_load_si256(in);
731                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
732                         __m256i found = _mm256_cmpeq_epi8(data, needle);
733                         if (!_mm256_testz_si256(found, found)) {
734                                 break;
735                         }
736
737                         ++in;
738                         ++out;
739                 }
740                 current_frame->len = (uint8_t *)out - current_frame->data;
741         }
742
743         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
744         return (const uint8_t *)in;
745 }
746
747 __attribute__((target("sse4.1")))
748 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
749 {
750         const __m128i needle = _mm_set1_epi8(sync_char);
751
752         const __m128i *in = (const __m128i *)aligned_start;
753         if (current_frame->interleaved) {
754                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
755                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
756                 if (current_frame->len % 2 == 1) {
757                         swap(out1, out2);
758                 }
759
760                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
761                 while (in < (const __m128i *)limit) {
762                         __m128i data1 = _mm_load_si128(in);
763                         __m128i data2 = _mm_load_si128(in + 1);
764                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
765                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
766                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
767                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
768                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
769                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
770                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
771                         _mm_storeu_si128(out2, hi);
772                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
773                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
774                         if (!_mm_testz_si128(found1, found1) ||
775                             !_mm_testz_si128(found2, found2)) {
776                                 break;
777                         }
778
779                         in += 2;
780                         ++out1;
781                         ++out2;
782                 }
783                 current_frame->len += (uint8_t *)in - aligned_start;
784         } else {
785                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
786                 while (in < (const __m128i *)limit) {
787                         __m128i data = _mm_load_si128(in);
788                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
789                         __m128i found = _mm_cmpeq_epi8(data, needle);
790                         if (!_mm_testz_si128(found, found)) {
791                                 break;
792                         }
793
794                         ++in;
795                         ++out;
796                 }
797                 current_frame->len = (uint8_t *)out - current_frame->data;
798         }
799
800         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
801         return (const uint8_t *)in;
802 }
803
804 #endif  // defined(HAS_MULTIVERSIONING)
805
806 void decode_packs(const libusb_transfer *xfr,
807                   const char *sync_pattern,
808                   int sync_length,
809                   FrameAllocator::Frame *current_frame,
810                   const char *frame_type_name,
811                   function<void(const uint8_t *start)> start_callback)
812 {
813         int offset = 0;
814         for (int i = 0; i < xfr->num_iso_packets; i++) {
815                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
816
817                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
818                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
819                         continue;
820 //exit(5);
821                 }
822
823                 const uint8_t *start = xfr->buffer + offset;
824                 const uint8_t *limit = start + pack->actual_length;
825                 while (start < limit) {  // Usually runs only one iteration.
826                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
827                         if (start == limit) break;
828                         assert(start < limit);
829
830                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
831                         if (start_next_frame == nullptr) {
832                                 // add the rest of the buffer
833                                 add_to_frame(current_frame, frame_type_name, start, limit);
834                                 break;
835                         } else {
836                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
837                                 start = start_next_frame + sync_length;  // skip sync
838                                 start_callback(start);
839                         }
840                 }
841 #if 0
842                 dump_pack(xfr, offset, pack);
843 #endif
844                 offset += pack->length;
845         }
846 }
847
848 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
849 {
850         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
851             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
852                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
853                 libusb_free_transfer(xfr);
854                 exit(3);
855         }
856
857         assert(xfr->user_data != nullptr);
858         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
859
860         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
861                 if (!usb->disconnected) {
862                         fprintf(stderr, "Device went away, stopping transfers.\n");
863                         usb->disconnected = true;
864                         if (usb->card_disconnected_callback) {
865                                 usb->card_disconnected_callback();
866                         }
867                 }
868                 // Don't reschedule the transfer; the loop will stop by itself.
869                 return;
870         }
871
872         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
873                 if (xfr->endpoint == 0x84) {
874                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
875                 } else {
876                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
877
878                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
879                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
880                 }
881         }
882         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
883                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
884                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
885 #if 0
886                 if (setup->wIndex == 44) {
887                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
888                 } else {
889                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
890                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
891                 }
892 #else
893                 memcpy(usb->register_file + usb->current_register, buf, 4);
894                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
895                 if (usb->current_register == 0) {
896                         // read through all of them
897                         printf("register dump:");
898                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
899                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
900                         }
901                         printf("\n");
902                 }
903                 libusb_fill_control_setup(xfr->buffer,
904                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
905                         /*index=*/usb->current_register, /*length=*/4);
906 #endif
907         }
908
909 #if 0
910         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
911         for (i = 0; i < xfr->actual_length; i++) {
912                 printf("%02x", xfr->buffer[i]);
913                 if (i % 16)
914                         printf("\n");
915                 else if (i % 8)
916                         printf("  ");
917                 else
918                         printf(" ");
919         }
920 #endif
921
922         int rc = libusb_submit_transfer(xfr);
923         if (rc < 0) {
924                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
925                 exit(1);
926         }
927 }
928
929 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
930 {
931         if (card_connected_callback != nullptr) {
932                 libusb_device_descriptor desc;
933                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
934                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
935                         libusb_unref_device(dev);
936                         return 1;
937                 }
938
939                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
940                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
941                         card_connected_callback(dev);  // Callback takes ownership.
942                         return 0;
943                 }
944         }
945         libusb_unref_device(dev);
946         return 0;
947 }
948
949 void BMUSBCapture::usb_thread_func()
950 {
951         sched_param param;
952         memset(&param, 0, sizeof(param));
953         param.sched_priority = 1;
954         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
955                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
956         }
957         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
958         while (!should_quit) {
959                 timeval sec { 1, 0 };
960                 int rc = libusb_handle_events_timeout(nullptr, &sec);
961                 if (rc != LIBUSB_SUCCESS)
962                         break;
963         }
964 }
965
966 namespace {
967
968 struct USBCardDevice {
969         uint16_t product;
970         uint8_t bus, port;
971         libusb_device *device;
972 };
973
// Maps a USB product ID to a human-readable card name. Only the two known
// product IDs are valid; anything else trips an assertion (and yields
// nullptr if assertions are compiled out).
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
985
986 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
987 {
988         const char *product_name = get_product_name(product);
989
990         char buf[256];
991         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
992                 id, bus, port, product_name);
993         return buf;
994 }
995
996 vector<USBCardDevice> find_all_cards()
997 {
998         libusb_device **devices;
999         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
1000         if (num_devices == -1) {
1001                 fprintf(stderr, "Error finding USB devices\n");
1002                 exit(1);
1003         }
1004         vector<USBCardDevice> found_cards;
1005         for (ssize_t i = 0; i < num_devices; ++i) {
1006                 libusb_device_descriptor desc;
1007                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1008                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1009                         exit(1);
1010                 }
1011
1012                 uint8_t bus = libusb_get_bus_number(devices[i]);
1013                 uint8_t port = libusb_get_port_number(devices[i]);
1014
1015                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1016                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1017                         libusb_unref_device(devices[i]);
1018                         continue;
1019                 }
1020
1021                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1022         }
1023         libusb_free_device_list(devices, 0);
1024
1025         // Sort the devices to get a consistent ordering.
1026         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1027                 if (a.product != b.product)
1028                         return a.product < b.product;
1029                 if (a.bus != b.bus)
1030                         return a.bus < b.bus;
1031                 return a.port < b.port;
1032         });
1033
1034         return found_cards;
1035 }
1036
1037 libusb_device_handle *open_card(int card_index, string *description)
1038 {
1039         vector<USBCardDevice> found_cards = find_all_cards();
1040
1041         for (size_t i = 0; i < found_cards.size(); ++i) {
1042                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1043                 fprintf(stderr, "%s\n", tmp_description.c_str());
1044                 if (i == size_t(card_index)) {
1045                         *description = tmp_description;
1046                 }
1047         }
1048
1049         if (size_t(card_index) >= found_cards.size()) {
1050                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1051                 exit(1);
1052         }
1053
1054         libusb_device_handle *devh;
1055         int rc = libusb_open(found_cards[card_index].device, &devh);
1056         if (rc < 0) {
1057                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1058                 exit(1);
1059         }
1060
1061         for (size_t i = 0; i < found_cards.size(); ++i) {
1062                 libusb_unref_device(found_cards[i].device);
1063         }
1064
1065         return devh;
1066 }
1067
1068 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1069 {
1070         uint8_t bus = libusb_get_bus_number(dev);
1071         uint8_t port = libusb_get_port_number(dev);
1072
1073         libusb_device_descriptor desc;
1074         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1075                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1076                 exit(1);
1077         }
1078
1079         *description = get_card_description(card_index, bus, port, desc.idProduct);
1080
1081         libusb_device_handle *devh;
1082         int rc = libusb_open(dev, &devh);
1083         if (rc < 0) {
1084                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1085                 exit(1);
1086         }
1087
1088         return devh;
1089 }
1090
1091 }  // namespace
1092
1093 unsigned BMUSBCapture::num_cards()
1094 {
1095         int rc = libusb_init(nullptr);
1096         if (rc < 0) {
1097                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1098                 exit(1);
1099         }
1100
1101         vector<USBCardDevice> found_cards = find_all_cards();
1102         unsigned ret = found_cards.size();
1103         for (size_t i = 0; i < found_cards.size(); ++i) {
1104                 libusb_unref_device(found_cards[i].device);
1105         }
1106         return ret;
1107 }
1108
1109 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1110 {
1111         current_pixel_format = pixel_format;
1112         update_capture_mode();
1113 }
1114
1115 void BMUSBCapture::configure_card()
1116 {
1117         if (video_frame_allocator == nullptr) {
1118                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1119                 set_video_frame_allocator(owned_video_frame_allocator.get());
1120         }
1121         if (audio_frame_allocator == nullptr) {
1122                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1123                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1124         }
1125         dequeue_thread_should_quit = false;
1126         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1127
1128         int rc;
1129         struct libusb_transfer *xfr;
1130
1131         rc = libusb_init(nullptr);
1132         if (rc < 0) {
1133                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1134                 exit(1);
1135         }
1136
1137         if (dev == nullptr) {
1138                 devh = open_card(card_index, &description);
1139         } else {
1140                 devh = open_card(card_index, dev, &description);
1141                 libusb_unref_device(dev);
1142         }
1143         if (!devh) {
1144                 fprintf(stderr, "Error finding USB device\n");
1145                 exit(1);
1146         }
1147
1148         libusb_config_descriptor *config;
1149         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1150         if (rc < 0) {
1151                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1152                 exit(1);
1153         }
1154
1155 #if 0
1156         printf("%d interface\n", config->bNumInterfaces);
1157         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1158                 printf("  interface %d\n", interface_number);
1159                 const libusb_interface *interface = &config->interface[interface_number];
1160                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1161                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1162                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1163                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1164                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1165                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1166                         }
1167                 }
1168         }
1169 #endif
1170
1171         rc = libusb_set_configuration(devh, /*configuration=*/1);
1172         if (rc < 0) {
1173                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1174                 exit(1);
1175         }
1176
1177         rc = libusb_claim_interface(devh, 0);
1178         if (rc < 0) {
1179                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1180                 exit(1);
1181         }
1182
1183         // Alternate setting 1 is output, alternate setting 2 is input.
1184         // Card is reset when switching alternates, so the driver uses
1185         // this “double switch” when it wants to reset.
1186         //
1187         // There's also alternate settings 3 and 4, which seem to be
1188         // like 1 and 2 except they advertise less bandwidth needed.
1189         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1190         if (rc < 0) {
1191                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1192                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1193                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1194                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1195                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1196                 }
1197                 exit(1);
1198         }
1199         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1200         if (rc < 0) {
1201                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1202                 exit(1);
1203         }
1204 #if 0
1205         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1206         if (rc < 0) {
1207                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1208                 exit(1);
1209         }
1210 #endif
1211
1212 #if 0
1213         rc = libusb_claim_interface(devh, 3);
1214         if (rc < 0) {
1215                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1216                 exit(1);
1217         }
1218 #endif
1219
1220         // theories:
1221         //   44 is some kind of timer register (first 16 bits count upwards)
1222         //   24 is some sort of watchdog?
1223         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1224         //      (or will go to 0x73c60010?), also seen 0x73c60100
1225         //   12 also changes all the time, unclear why  
1226         //   16 seems to be autodetected mode somehow
1227         //      --    this is e00115e0 after reset?
1228         //                    ed0115e0 after mode change [to output?]
1229         //                    2d0015e0 after more mode change [to input]
1230         //                    ed0115e0 after more mode change
1231         //                    2d0015e0 after more mode change
1232         //
1233         //                    390115e0 seems to indicate we have signal
1234         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1235         //
1236         //                    200015e0 on startup
1237         //         changes to 250115e0 when we sync to the signal
1238         //
1239         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1240         //
1241         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1242         //
1243         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1244         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1245         //
1246         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1247         //    perhaps some of them are related to analog output?
1248         //
1249         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1250         //    but the driver sets it to 0x8036802a at some point.
1251         //
1252         //    all of this is on request 214/215. other requests (192, 219,
1253         //    222, 223, 224) are used for firmware upgrade. Probably best to
1254         //    stay out of it unless you know what you're doing.
1255         //
1256         //
1257         // register 16:
1258         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1259         //
1260         // theories:
1261         //   0x01 - stable signal
1262         //   0x04 - deep color
1263         //   0x08 - unknown (audio??)
1264         //   0x20 - 720p??
1265         //   0x30 - 576p??
1266
1267         update_capture_mode();
1268
1269         struct ctrl {
1270                 int endpoint;
1271                 int request;
1272                 int index;
1273                 uint32_t data;
1274         };
1275         static const ctrl ctrls[] = {
1276                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1277                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1278
1279                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1280                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1281                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1282                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1283         };
1284
1285         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1286                 uint32_t flipped = htonl(ctrls[req].data);
1287                 static uint8_t value[4];
1288                 memcpy(value, &flipped, sizeof(flipped));
1289                 int size = sizeof(value);
1290                 //if (ctrls[req].request == 215) size = 0;
1291                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1292                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1293                 if (rc < 0) {
1294                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1295                         exit(1);
1296                 }
1297
1298                 if (ctrls[req].index == 16 && rc == 4) {
1299                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1300                 }
1301
1302 #if 0
1303                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1304                 for (int i = 0; i < rc; ++i) {
1305                         printf("%02x", value[i]);
1306                 }
1307                 printf("\n");
1308 #endif
1309         }
1310
1311 #if 0
1312         // DEBUG
1313         for ( ;; ) {
1314                 static int my_index = 0;
1315                 static uint8_t value[4];
1316                 int size = sizeof(value);
1317                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1318                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1319                 if (rc < 0) {
1320                         fprintf(stderr, "Error on control\n");
1321                         exit(1);
1322                 }
1323                 printf("rc=%d index=%d: 0x", rc, my_index);
1324                 for (int i = 0; i < rc; ++i) {
1325                         printf("%02x", value[i]);
1326                 }
1327                 printf("\n");
1328         }
1329 #endif
1330
1331 #if 0
1332         // set up an asynchronous transfer of the timer register
1333         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1334         static int completed = 0;
1335
1336         xfr = libusb_alloc_transfer(0);
1337         libusb_fill_control_setup(cmdbuf,
1338             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1339                 /*index=*/44, /*length=*/4);
1340         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1341         xfr->user_data = this;
1342         libusb_submit_transfer(xfr);
1343
1344         // set up an asynchronous transfer of register 24
1345         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1346         static int completed2 = 0;
1347
1348         xfr = libusb_alloc_transfer(0);
1349         libusb_fill_control_setup(cmdbuf2,
1350             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1351                 /*index=*/24, /*length=*/4);
1352         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1353         xfr->user_data = this;
1354         libusb_submit_transfer(xfr);
1355 #endif
1356
1357         // set up an asynchronous transfer of the register dump
1358         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1359         static int completed3 = 0;
1360
1361         xfr = libusb_alloc_transfer(0);
1362         libusb_fill_control_setup(cmdbuf3,
1363             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1364                 /*index=*/current_register, /*length=*/4);
1365         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1366         xfr->user_data = this;
1367         //libusb_submit_transfer(xfr);
1368
1369         //audiofp = fopen("audio.raw", "wb");
1370
1371         // set up isochronous transfers for audio and video
1372         for (int e = 3; e <= 4; ++e) {
1373                 int num_transfers = 6;
1374                 for (int i = 0; i < num_transfers; ++i) {
1375                         size_t buf_size;
1376                         int num_iso_pack, size;
1377                         if (e == 3) {
1378                                 // Allocate for minimum width (because that will give us the most
1379                                 // number of packets, so we don't need to reallocate, but we'll
1380                                 // default to 720p for the first frame.
1381                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1382                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1383                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1384                         } else {
1385                                 size = 0xc0;
1386                                 num_iso_pack = 80;
1387                                 buf_size = num_iso_pack * size;
1388                         }
1389                         int num_bytes = num_iso_pack * size;
1390                         assert(size_t(num_bytes) <= buf_size);
1391 #if LIBUSB_API_VERSION >= 0x01000105
1392                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1393 #else
1394                         uint8_t *buf = nullptr;
1395 #endif
1396                         if (buf == nullptr) {
1397                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1398 #if LIBUSB_API_VERSION >= 0x01000105
1399                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1400 #else
1401                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1402 #endif
1403                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1404                                 buf = new uint8_t[num_bytes];
1405                         }
1406
1407                         xfr = libusb_alloc_transfer(num_iso_pack);
1408                         if (!xfr) {
1409                                 fprintf(stderr, "oom\n");
1410                                 exit(1);
1411                         }
1412
1413                         int ep = LIBUSB_ENDPOINT_IN | e;
1414                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1415                                 num_iso_pack, cb_xfr, nullptr, 0);
1416                         libusb_set_iso_packet_lengths(xfr, size);
1417                         xfr->user_data = this;
1418
1419                         if (e == 3) {
1420                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1421                         }
1422
1423                         iso_xfrs.push_back(xfr);
1424                 }
1425         }
1426 }
1427
1428 void BMUSBCapture::start_bm_capture()
1429 {
1430         int i = 0;
1431         for (libusb_transfer *xfr : iso_xfrs) {
1432                 int rc = libusb_submit_transfer(xfr);
1433                 ++i;
1434                 if (rc < 0) {
1435                         //printf("num_bytes=%d\n", num_bytes);
1436                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1437                                 xfr->endpoint, i, libusb_error_name(rc));
1438                         exit(1);
1439                 }
1440         }
1441
1442
1443 #if 0
1444         libusb_release_interface(devh, 0);
1445 out:
1446         if (devh)
1447                 libusb_close(devh);
1448         libusb_exit(nullptr);
1449         return rc;
1450 #endif
1451 }
1452
1453 void BMUSBCapture::stop_dequeue_thread()
1454 {
1455         dequeue_thread_should_quit = true;
1456         queues_not_empty.notify_all();
1457         dequeue_thread.join();
1458 }
1459
1460 void BMUSBCapture::start_bm_thread()
1461 {
1462         // Devices leaving are discovered by seeing the isochronous packets
1463         // coming back with errors, so only care about devices joining.
1464         if (card_connected_callback != nullptr) {
1465                 if (libusb_hotplug_register_callback(
1466                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1467                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1468                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1469                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1470                         exit(1);
1471                 }
1472         }
1473
1474         should_quit = false;
1475         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1476 }
1477
1478 void BMUSBCapture::stop_bm_thread()
1479 {
1480         should_quit = true;
1481         usb_thread.join();
1482 }
1483
1484 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1485 {
1486         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1487         VideoMode auto_mode;
1488         auto_mode.name = "Autodetect";
1489         auto_mode.autodetect = true;
1490         return {{ 0, auto_mode }};
1491 }
1492
1493 uint32_t BMUSBCapture::get_current_video_mode() const
1494 {
1495         return 0;  // Matches get_available_video_modes().
1496 }
1497
1498 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1499 {
1500         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1501 }
1502
1503 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1504 {
1505         return {
1506                 { 0x00000000, "HDMI/SDI" },
1507                 { 0x02000000, "Component" },
1508                 { 0x04000000, "Composite" },
1509                 { 0x06000000, "S-video" }
1510         };
1511 }
1512
1513 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1514 {
1515         assert((video_input_id & ~0x06000000) == 0);
1516         current_video_input = video_input_id;
1517         update_capture_mode();
1518 }
1519
1520 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1521 {
1522         return {
1523                 { 0x00000000, "Embedded" },
1524                 { 0x10000000, "Analog" }
1525         };
1526 }
1527
1528 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1529 {
1530         assert((audio_input_id & ~0x10000000) == 0);
1531         current_audio_input = audio_input_id;
1532         update_capture_mode();
1533 }
1534
1535 void BMUSBCapture::update_capture_mode()
1536 {
1537         if (devh == nullptr) {
1538                 return;
1539         }
1540
1541         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1542         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1543         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1544                 mode |= htonl(0x20000000);
1545         } else {
1546                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1547         }
1548
1549         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1550                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1551         if (rc < 0) {
1552                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1553                 exit(1);
1554         }
1555 }
1556
1557 }  // namespace bmusb