]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Release 0.7.7 (no code changes, Makefile only).
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.7.2
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
// Destination stream for dump_audio_block(); starts out as a null pointer.
FILE *audiofp = nullptr;

// File-local thread handle and quit flag (the flag starts out false).
thread usb_thread;
atomic<bool> should_quit{false};
65
// Returns the size in bytes of one v210 line that is <width> pixels wide:
// every group of six pixels (rounded up) occupies four 32-bit words.
int v210_stride(int width)
{
        const int pixel_groups = (width + 5) / 6;
        return pixel_groups * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row in the table of known video modes used by decode_video_format().
// The rows are written as positional initializers, so the order of the
// members below must not be changed.
struct VideoFormatEntry {
        uint16_t normalized_video_format;  // Format word after masking off the flag bits.
        unsigned width;
        unsigned height;
        unsigned second_field_start;       // 0 for the progressive modes in the table.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;
        unsigned frame_rate_nom;           // Frame rate is frame_rate_nom / frame_rate_den.
        unsigned frame_rate_den;
        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0151,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
203                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
204                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
205                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
206                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
207                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
208                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
209                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
210                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
211                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
212                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
213                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
214                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
215                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
216         };
217         for (const VideoFormatEntry &entry : entries) {
218                 if (normalized_video_format == entry.normalized_video_format) {
219                         decoded_video_format->width = entry.width;
220                         decoded_video_format->height = entry.height;
221                         if (video_format & 0x0800) {
222                                 decoded_video_format->stride = entry.width * 2;
223                         } else {
224                                 decoded_video_format->stride = v210_stride(entry.width);
225                         }
226                         decoded_video_format->second_field_start = entry.second_field_start;
227                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
228                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
229                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
230                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
231                         decoded_video_format->interlaced = entry.interlaced;
232                         return true;
233                 }
234         }
235
236         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
237         decoded_video_format->width = 1280;
238         decoded_video_format->height = 720;
239         decoded_video_format->stride = 1280 * 2;
240         decoded_video_format->frame_rate_nom = 60;
241         decoded_video_format->frame_rate_den = 1;
242         return false;
243 }
244
245 // There are seemingly no direct indicators of sample rate; you just get
246 // one frame's worth and have to guess from that.
247 int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
248 {
249         size_t num_samples = len / 3 / 8;
250         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
251
252         // See if we match or are very close to any of the mandatory HDMI sample rates.
253         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
254         for (int rate : candidate_sample_rates) {
255                 if (abs(int(num_samples_per_second) - rate) <= 100) {
256                         return rate;
257                 }
258         }
259
260         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, keeping capture at %d Hz\n",
261                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
262         return default_rate;
263 }
264
265 }  // namespace
266
267 FrameAllocator::~FrameAllocator() {}
268
269 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
270         : frame_size(frame_size)
271 {
272         for (size_t i = 0; i < num_queued_frames; ++i) {
273                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
274         }
275 }
276
277 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
278 {
279         Frame vf;
280         vf.owner = this;
281
282         unique_lock<mutex> lock(freelist_mutex);  // Meh.
283         if (freelist.empty()) {
284                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
285                         frame_size);
286         } else {
287                 vf.data = freelist.top().release();
288                 vf.size = frame_size;
289                 freelist.pop();  // Meh.
290         }
291         return vf;
292 }
293
294 void MallocFrameAllocator::release_frame(Frame frame)
295 {
296         if (frame.overflow > 0) {
297                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
298         }
299         unique_lock<mutex> lock(freelist_mutex);
300         freelist.push(unique_ptr<uint8_t[]>(frame.data));
301 }
302
// Compares two 16-bit counters that wrap around: returns true if <b> is
// ahead of <a> by at least 1 and less than 0x8000 steps (modulo 0x10000).
// Equal values, and values exactly 0x8000 apart, compare as "not less".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
314
315 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
316 {
317         unique_lock<mutex> lock(queue_lock);
318         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
319                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
320                         q->back().timecode, timecode);
321                 frame.owner->release_frame(frame);
322                 return;
323         }
324
325         QueuedFrame qf;
326         qf.format = format;
327         qf.timecode = timecode;
328         qf.frame = frame;
329         q->push_back(move(qf));
330         queues_not_empty.notify_one();  // might be spurious
331 }
332
333 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
334 {
335         FILE *fp = fopen(filename, "wb");
336         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
337                 printf("short write!\n");
338         }
339         fclose(fp);
340 }
341
342 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
343 {
344         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
345 }
346
347 void BMUSBCapture::dequeue_thread_func()
348 {
349         char thread_name[16];
350         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
351         pthread_setname_np(pthread_self(), thread_name);
352
353         if (has_dequeue_callbacks) {
354                 dequeue_init_callback();
355         }
356         size_t last_sample_rate = 48000;
357         while (!dequeue_thread_should_quit) {
358                 unique_lock<mutex> lock(queue_lock);
359                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
360
361                 if (dequeue_thread_should_quit) break;
362
363                 uint16_t video_timecode = pending_video_frames.front().timecode;
364                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
365                 AudioFormat audio_format;
366                 audio_format.bits_per_sample = 24;
367                 audio_format.num_channels = 8;
368                 audio_format.sample_rate = last_sample_rate;
369                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
370                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
371                                 video_timecode);
372                         QueuedFrame video_frame = pending_video_frames.front();
373                         pending_video_frames.pop_front();
374                         lock.unlock();
375                         video_frame_allocator->release_frame(video_frame.frame);
376                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
377                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
378                                 audio_timecode);
379                         QueuedFrame audio_frame = pending_audio_frames.front();
380                         pending_audio_frames.pop_front();
381                         lock.unlock();
382                         audio_format.id = audio_frame.format;
383
384                         // Use the video format of the pending frame.
385                         QueuedFrame video_frame = pending_video_frames.front();
386                         VideoFormat video_format;
387                         decode_video_format(video_frame.format, &video_format);
388
389                         frame_callback(audio_timecode,
390                                        FrameAllocator::Frame(), 0, video_format,
391                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
392                 } else {
393                         QueuedFrame video_frame = pending_video_frames.front();
394                         QueuedFrame audio_frame = pending_audio_frames.front();
395                         pending_audio_frames.pop_front();
396                         pending_video_frames.pop_front();
397                         lock.unlock();
398
399 #if 0
400                         char filename[255];
401                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
402                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
403                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
404 #endif
405
406                         VideoFormat video_format;
407                         audio_format.id = audio_frame.format;
408                         if (decode_video_format(video_frame.format, &video_format)) {
409                                 if (audio_frame.frame.len != 0) {
410                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
411                                         last_sample_rate = audio_format.sample_rate;
412                                 }
413                                 frame_callback(video_timecode,
414                                                video_frame.frame, HEADER_SIZE, video_format,
415                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
416                         } else {
417                                 video_frame_allocator->release_frame(video_frame.frame);
418                                 audio_format.sample_rate = last_sample_rate;
419                                 frame_callback(video_timecode,
420                                                FrameAllocator::Frame(), 0, video_format,
421                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
422                         }
423                 }
424         }
425         if (has_dequeue_callbacks) {
426                 dequeue_cleanup_callback();
427         }
428 }
429
430 void BMUSBCapture::start_new_frame(const uint8_t *start)
431 {
432         uint16_t format = (start[3] << 8) | start[2];
433         uint16_t timecode = (start[1] << 8) | start[0];
434
435         if (current_video_frame.len > 0) {
436                 current_video_frame.received_timestamp = steady_clock::now();
437
438                 // If format is 0x0800 (no signal), add a fake (empty) audio
439                 // frame to get it out of the queue.
440                 // TODO: Figure out if there are other formats that come with
441                 // no audio, and treat them the same.
442                 if (format == 0x0800) {
443                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
444                         if (fake_audio_frame.data == nullptr) {
445                                 // Oh well, it's just a no-signal frame anyway.
446                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
447                                 current_video_frame.owner->release_frame(current_video_frame);
448                                 current_video_frame = video_frame_allocator->alloc_frame();
449                                 return;
450                         }
451                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
452                 }
453                 //dump_frame();
454                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
455
456                 // Update the assumed frame width. We might be one frame too late on format changes,
457                 // but it's much better than asking the user to choose manually.
458                 VideoFormat video_format;
459                 if (decode_video_format(format, &video_format)) {
460                         assumed_frame_width = video_format.width;
461                 }
462         }
463         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
464         //      format, timecode,
465         //      //start[7], start[6], start[5], start[4],
466         //      read_current_frame, FRAME_SIZE);
467
468         current_video_frame = video_frame_allocator->alloc_frame();
469         //if (current_video_frame.data == nullptr) {
470         //      read_current_frame = -1;
471         //} else {
472         //      read_current_frame = 0;
473         //}
474 }
475
476 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
477 {
478         uint16_t format = (start[3] << 8) | start[2];
479         uint16_t timecode = (start[1] << 8) | start[0];
480         if (current_audio_frame.len > 0) {
481                 current_audio_frame.received_timestamp = steady_clock::now();
482                 //dump_audio_block();
483                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
484         }
485         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
486         //      format, timecode);
487         current_audio_frame = audio_frame_allocator->alloc_frame();
488 }
489
#if 0
// Debugging aid (compiled out): hex-dumps the bytes actually received in one
// isochronous packet, 16 bytes per line with a wider gap after the eighth.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        for (unsigned j = 0; j < pack->actual_length; ++j) {
        //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
                const char *separator = " ";
                if ((j % 16) == 15) {
                        separator = "\n";
                } else if ((j % 8) == 7) {
                        separator = "  ";
                }
                printf("%02x%s", xfr->buffer[j + offset], separator);
        }
}
#endif
506
// Splits <n> bytes from <src> between two destinations: bytes at even
// offsets go to <dest1> and bytes at odd offsets go to <dest2>, so each
// destination receives n/2 bytes. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t i = 0; i < n; i += 2) {
                dest1[i / 2] = src[i];
                dest2[i / 2] = src[i + 1];
        }
}
518
519 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
520 {
521         if (current_frame->data == nullptr ||
522             current_frame->len > current_frame->size ||
523             start == end) {
524                 return;
525         }
526
527         int bytes = end - start;
528         if (current_frame->len + bytes > current_frame->size) {
529                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
530                 current_frame->len = current_frame->size;
531                 if (current_frame->overflow > 1048576) {
532                         printf("%d bytes overflow after last %s frame\n",
533                                 int(current_frame->overflow), frame_type_name);
534                         current_frame->overflow = 0;
535                 }
536                 //dump_frame();
537         } else {
538                 if (current_frame->interleaved) {
539                         uint8_t *data = current_frame->data + current_frame->len / 2;
540                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
541                         if (current_frame->len % 2 == 1) {
542                                 ++data;
543                                 swap(data, data2);
544                         }
545                         if (bytes % 2 == 1) {
546                                 *data++ = *start++;
547                                 swap(data, data2);
548                                 ++current_frame->len;
549                                 --bytes;
550                         }
551                         memcpy_interleaved(data, data2, start, bytes);
552                         current_frame->len += bytes;
553                 } else {
554                         memcpy(current_frame->data + current_frame->len, start, bytes);
555                         current_frame->len += bytes;
556                 }
557         }
558 }
559
#if 0
// Debugging aid (compiled out): prints the 32 bytes of an AVX2 register in
// hex, lowest byte first, in four groups of eight, prefixed by <name>.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        memcpy(bytes, &n, sizeof(bytes));

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i > 0 && i % 8 == 0) {
                        // Extra space between the groups.
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
602
603 #ifndef HAS_MULTIVERSIONING
604
605 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
606 {
607         // No fast path possible unless we have multiversioning.
608         return start;
609 }
610
611 #else  // defined(HAS_MULTIVERSIONING)
612
613 __attribute__((target("sse4.1")))
614 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
615
616 __attribute__((target("avx2")))
617 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
618
619 // Does a memcpy and memchr in one to reduce processing time.
620 // Note that the benefit is somewhat limited if your L3 cache is small,
621 // as you'll (unfortunately) spend most of the time loading the data
622 // from main memory.
623 //
624 // Complicated cases are left to the slow path; it basically stops copying
625 // up until the first instance of "sync_char" (usually a bit before, actually).
626 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
627 // data, and what we really need this for is the 00 00 ff ff marker in video data.
628 __attribute__((target("default")))
629 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
630 {
631         // No fast path possible unless we have SSE 4.1 or higher.
632         return start;
633 }
634
635 __attribute__((target("sse4.1", "avx2")))
636 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
637 {
638         if (current_frame->data == nullptr ||
639             current_frame->len > current_frame->size ||
640             start == limit) {
641                 return start;
642         }
643         size_t orig_bytes = limit - start;
644         if (orig_bytes < 128) {
645                 // Don't bother.
646                 return start;
647         }
648
649         // Don't read more bytes than we can write.
650         limit = min(limit, start + (current_frame->size - current_frame->len));
651
652         // Align end to 32 bytes.
653         limit = (const uint8_t *)(intptr_t(limit) & ~31);
654
655         if (start >= limit) {
656                 return start;
657         }
658
659         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
660         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
661         if (aligned_start != start) {
662                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
663                 if (sync_start == nullptr) {
664                         add_to_frame(current_frame, "", start, aligned_start);
665                 } else {
666                         add_to_frame(current_frame, "", start, sync_start);
667                         return sync_start;
668                 }
669         }
670
671         // Make the length a multiple of 64.
672         if (current_frame->interleaved) {
673                 if (((limit - aligned_start) % 64) != 0) {
674                         limit -= 32;
675                 }
676                 assert(((limit - aligned_start) % 64) == 0);
677         }
678
679         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
680 }
681
682 __attribute__((target("avx2")))
683 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
684 {
685         const __m256i needle = _mm256_set1_epi8(sync_char);
686
687         const __restrict __m256i *in = (const __m256i *)aligned_start;
688         if (current_frame->interleaved) {
689                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
690                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
691                 if (current_frame->len % 2 == 1) {
692                         swap(out1, out2);
693                 }
694
695                 __m256i shuffle_cw = _mm256_set_epi8(
696                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
697                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
698                 while (in < (const __m256i *)limit) {
699                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
700                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
701                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
702
703                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
704                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
705                         __m256i found = _mm256_or_si256(found1, found2);
706
707                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
708                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
709                 
710                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
711                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
712
713                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
714                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
715
716                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
717                         _mm256_storeu_si256(out2, hi);
718
719                         if (!_mm256_testz_si256(found, found)) {
720                                 break;
721                         }
722
723                         in += 2;
724                         ++out1;
725                         ++out2;
726                 }
727                 current_frame->len += (uint8_t *)in - aligned_start;
728         } else {
729                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
730                 while (in < (const __m256i *)limit) {
731                         __m256i data = _mm256_load_si256(in);
732                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
733                         __m256i found = _mm256_cmpeq_epi8(data, needle);
734                         if (!_mm256_testz_si256(found, found)) {
735                                 break;
736                         }
737
738                         ++in;
739                         ++out;
740                 }
741                 current_frame->len = (uint8_t *)out - current_frame->data;
742         }
743
744         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
745         return (const uint8_t *)in;
746 }
747
748 __attribute__((target("sse4.1")))
749 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
750 {
751         const __m128i needle = _mm_set1_epi8(sync_char);
752
753         const __m128i *in = (const __m128i *)aligned_start;
754         if (current_frame->interleaved) {
755                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
756                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
757                 if (current_frame->len % 2 == 1) {
758                         swap(out1, out2);
759                 }
760
761                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
762                 while (in < (const __m128i *)limit) {
763                         __m128i data1 = _mm_load_si128(in);
764                         __m128i data2 = _mm_load_si128(in + 1);
765                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
766                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
767                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
768                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
769                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
770                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
771                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
772                         _mm_storeu_si128(out2, hi);
773                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
774                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
775                         if (!_mm_testz_si128(found1, found1) ||
776                             !_mm_testz_si128(found2, found2)) {
777                                 break;
778                         }
779
780                         in += 2;
781                         ++out1;
782                         ++out2;
783                 }
784                 current_frame->len += (uint8_t *)in - aligned_start;
785         } else {
786                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
787                 while (in < (const __m128i *)limit) {
788                         __m128i data = _mm_load_si128(in);
789                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
790                         __m128i found = _mm_cmpeq_epi8(data, needle);
791                         if (!_mm_testz_si128(found, found)) {
792                                 break;
793                         }
794
795                         ++in;
796                         ++out;
797                 }
798                 current_frame->len = (uint8_t *)out - current_frame->data;
799         }
800
801         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
802         return (const uint8_t *)in;
803 }
804
805 #endif  // defined(HAS_MULTIVERSIONING)
806
807 void decode_packs(const libusb_transfer *xfr,
808                   const char *sync_pattern,
809                   int sync_length,
810                   FrameAllocator::Frame *current_frame,
811                   const char *frame_type_name,
812                   function<void(const uint8_t *start)> start_callback)
813 {
814         int offset = 0;
815         for (int i = 0; i < xfr->num_iso_packets; i++) {
816                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
817
818                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
819                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
820                         continue;
821 //exit(5);
822                 }
823
824                 const uint8_t *start = xfr->buffer + offset;
825                 const uint8_t *limit = start + pack->actual_length;
826                 while (start < limit) {  // Usually runs only one iteration.
827                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
828                         if (start == limit) break;
829                         assert(start < limit);
830
831                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
832                         if (start_next_frame == nullptr) {
833                                 // add the rest of the buffer
834                                 add_to_frame(current_frame, frame_type_name, start, limit);
835                                 break;
836                         } else {
837                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
838                                 start = start_next_frame + sync_length;  // skip sync
839                                 start_callback(start);
840                         }
841                 }
842 #if 0
843                 dump_pack(xfr, offset, pack);
844 #endif
845                 offset += pack->length;
846         }
847 }
848
849 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
850 {
851         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
852             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
853                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
854                 libusb_free_transfer(xfr);
855                 exit(3);
856         }
857
858         assert(xfr->user_data != nullptr);
859         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
860
861         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
862                 if (!usb->disconnected) {
863                         fprintf(stderr, "Device went away, stopping transfers.\n");
864                         usb->disconnected = true;
865                         if (usb->card_disconnected_callback) {
866                                 usb->card_disconnected_callback();
867                         }
868                 }
869                 // Don't reschedule the transfer; the loop will stop by itself.
870                 return;
871         }
872
873         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
874                 if (xfr->endpoint == 0x84) {
875                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
876                 } else {
877                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
878
879                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
880                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
881                 }
882         }
883         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
884                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
885                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
886 #if 0
887                 if (setup->wIndex == 44) {
888                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
889                 } else {
890                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
891                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
892                 }
893 #else
894                 memcpy(usb->register_file + usb->current_register, buf, 4);
895                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
896                 if (usb->current_register == 0) {
897                         // read through all of them
898                         printf("register dump:");
899                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
900                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
901                         }
902                         printf("\n");
903                 }
904                 libusb_fill_control_setup(xfr->buffer,
905                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
906                         /*index=*/usb->current_register, /*length=*/4);
907 #endif
908         }
909
910 #if 0
911         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
912         for (i = 0; i < xfr->actual_length; i++) {
913                 printf("%02x", xfr->buffer[i]);
914                 if (i % 16)
915                         printf("\n");
916                 else if (i % 8)
917                         printf("  ");
918                 else
919                         printf(" ");
920         }
921 #endif
922
923         int rc = libusb_submit_transfer(xfr);
924         if (rc < 0) {
925                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
926                 exit(1);
927         }
928 }
929
930 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
931 {
932         if (card_connected_callback != nullptr) {
933                 libusb_device_descriptor desc;
934                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
935                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
936                         libusb_unref_device(dev);
937                         return 1;
938                 }
939
940                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
941                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
942                         card_connected_callback(dev);  // Callback takes ownership.
943                         return 0;
944                 }
945         }
946         libusb_unref_device(dev);
947         return 0;
948 }
949
950 void BMUSBCapture::usb_thread_func()
951 {
952         sched_param param;
953         memset(&param, 0, sizeof(param));
954         param.sched_priority = 1;
955         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
956                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
957         }
958         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
959         while (!should_quit) {
960                 timeval sec { 1, 0 };
961                 int rc = libusb_handle_events_timeout(nullptr, &sec);
962                 if (rc != LIBUSB_SUCCESS)
963                         break;
964         }
965 }
966
967 namespace {
968
969 struct USBCardDevice {
970         uint16_t product;
971         uint8_t bus, port;
972         libusb_device *device;
973 };
974
// Maps a supported USB product ID to a human-readable product name.
// Any other product ID is a programming error: it trips an assertion,
// and nullptr is returned if assertions are compiled out.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
986
987 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
988 {
989         const char *product_name = get_product_name(product);
990
991         char buf[256];
992         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
993                 id, bus, port, product_name);
994         return buf;
995 }
996
997 vector<USBCardDevice> find_all_cards()
998 {
999         libusb_device **devices;
1000         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
1001         if (num_devices == -1) {
1002                 fprintf(stderr, "Error finding USB devices\n");
1003                 exit(1);
1004         }
1005         vector<USBCardDevice> found_cards;
1006         for (ssize_t i = 0; i < num_devices; ++i) {
1007                 libusb_device_descriptor desc;
1008                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1009                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1010                         exit(1);
1011                 }
1012
1013                 uint8_t bus = libusb_get_bus_number(devices[i]);
1014                 uint8_t port = libusb_get_port_number(devices[i]);
1015
1016                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1017                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1018                         libusb_unref_device(devices[i]);
1019                         continue;
1020                 }
1021
1022                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1023         }
1024         libusb_free_device_list(devices, 0);
1025
1026         // Sort the devices to get a consistent ordering.
1027         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1028                 if (a.product != b.product)
1029                         return a.product < b.product;
1030                 if (a.bus != b.bus)
1031                         return a.bus < b.bus;
1032                 return a.port < b.port;
1033         });
1034
1035         return found_cards;
1036 }
1037
1038 libusb_device_handle *open_card(int card_index, string *description)
1039 {
1040         vector<USBCardDevice> found_cards = find_all_cards();
1041
1042         for (size_t i = 0; i < found_cards.size(); ++i) {
1043                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1044                 fprintf(stderr, "%s\n", tmp_description.c_str());
1045                 if (i == size_t(card_index)) {
1046                         *description = tmp_description;
1047                 }
1048         }
1049
1050         if (size_t(card_index) >= found_cards.size()) {
1051                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1052                 exit(1);
1053         }
1054
1055         libusb_device_handle *devh;
1056         int rc = libusb_open(found_cards[card_index].device, &devh);
1057         if (rc < 0) {
1058                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1059                 exit(1);
1060         }
1061
1062         for (size_t i = 0; i < found_cards.size(); ++i) {
1063                 libusb_unref_device(found_cards[i].device);
1064         }
1065
1066         return devh;
1067 }
1068
1069 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1070 {
1071         uint8_t bus = libusb_get_bus_number(dev);
1072         uint8_t port = libusb_get_port_number(dev);
1073
1074         libusb_device_descriptor desc;
1075         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1076                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1077                 exit(1);
1078         }
1079
1080         *description = get_card_description(card_index, bus, port, desc.idProduct);
1081
1082         libusb_device_handle *devh;
1083         int rc = libusb_open(dev, &devh);
1084         if (rc < 0) {
1085                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1086                 exit(1);
1087         }
1088
1089         return devh;
1090 }
1091
1092 }  // namespace
1093
1094 unsigned BMUSBCapture::num_cards()
1095 {
1096         int rc = libusb_init(nullptr);
1097         if (rc < 0) {
1098                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1099                 exit(1);
1100         }
1101
1102         vector<USBCardDevice> found_cards = find_all_cards();
1103         unsigned ret = found_cards.size();
1104         for (size_t i = 0; i < found_cards.size(); ++i) {
1105                 libusb_unref_device(found_cards[i].device);
1106         }
1107         return ret;
1108 }
1109
1110 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1111 {
1112         current_pixel_format = pixel_format;
1113         update_capture_mode();
1114 }
1115
1116 void BMUSBCapture::configure_card()
1117 {
1118         if (video_frame_allocator == nullptr) {
1119                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1120                 set_video_frame_allocator(owned_video_frame_allocator.get());
1121         }
1122         if (audio_frame_allocator == nullptr) {
1123                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1124                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1125         }
1126         dequeue_thread_should_quit = false;
1127         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1128
1129         int rc;
1130         struct libusb_transfer *xfr;
1131
1132         rc = libusb_init(nullptr);
1133         if (rc < 0) {
1134                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1135                 exit(1);
1136         }
1137
1138         if (dev == nullptr) {
1139                 devh = open_card(card_index, &description);
1140         } else {
1141                 devh = open_card(card_index, dev, &description);
1142                 libusb_unref_device(dev);
1143         }
1144         if (!devh) {
1145                 fprintf(stderr, "Error finding USB device\n");
1146                 exit(1);
1147         }
1148
1149         libusb_config_descriptor *config;
1150         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1151         if (rc < 0) {
1152                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1153                 exit(1);
1154         }
1155
1156 #if 0
1157         printf("%d interface\n", config->bNumInterfaces);
1158         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1159                 printf("  interface %d\n", interface_number);
1160                 const libusb_interface *interface = &config->interface[interface_number];
1161                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1162                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1163                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1164                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1165                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1166                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1167                         }
1168                 }
1169         }
1170 #endif
1171
1172         rc = libusb_set_configuration(devh, /*configuration=*/1);
1173         if (rc < 0) {
1174                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1175                 exit(1);
1176         }
1177
1178         rc = libusb_claim_interface(devh, 0);
1179         if (rc < 0) {
1180                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1181                 exit(1);
1182         }
1183
1184         // Alternate setting 1 is output, alternate setting 2 is input.
1185         // Card is reset when switching alternates, so the driver uses
1186         // this “double switch” when it wants to reset.
1187         //
1188         // There's also alternate settings 3 and 4, which seem to be
1189         // like 1 and 2 except they advertise less bandwidth needed.
1190         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1191         if (rc < 0) {
1192                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1193                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1194                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1195                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1196                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1197                 }
1198                 exit(1);
1199         }
1200         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1201         if (rc < 0) {
1202                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1203                 exit(1);
1204         }
1205 #if 0
1206         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1207         if (rc < 0) {
1208                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1209                 exit(1);
1210         }
1211 #endif
1212
1213 #if 0
1214         rc = libusb_claim_interface(devh, 3);
1215         if (rc < 0) {
1216                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1217                 exit(1);
1218         }
1219 #endif
1220
1221         // theories:
1222         //   44 is some kind of timer register (first 16 bits count upwards)
1223         //   24 is some sort of watchdog?
1224         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1225         //      (or will go to 0x73c60010?), also seen 0x73c60100
1226         //   12 also changes all the time, unclear why  
1227         //   16 seems to be autodetected mode somehow
1228         //      --    this is e00115e0 after reset?
1229         //                    ed0115e0 after mode change [to output?]
1230         //                    2d0015e0 after more mode change [to input]
1231         //                    ed0115e0 after more mode change
1232         //                    2d0015e0 after more mode change
1233         //
1234         //                    390115e0 seems to indicate we have signal
1235         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1236         //
1237         //                    200015e0 on startup
1238         //         changes to 250115e0 when we sync to the signal
1239         //
1240         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1241         //
1242         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
1243         //
1244         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1245         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1246         //
1247         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1248         //    perhaps some of them are related to analog output?
1249         //
1250         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1251         //    but the driver sets it to 0x8036802a at some point.
1252         //
1253         //    all of this is on request 214/215. other requests (192, 219,
1254         //    222, 223, 224) are used for firmware upgrade. Probably best to
1255         //    stay out of it unless you know what you're doing.
1256         //
1257         //
1258         // register 16:
1259         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1260         //
1261         // theories:
1262         //   0x01 - stable signal
1263         //   0x04 - deep color
1264         //   0x08 - unknown (audio??)
1265         //   0x20 - 720p??
1266         //   0x30 - 576p??
1267
1268         update_capture_mode();
1269
1270         struct ctrl {
1271                 int endpoint;
1272                 int request;
1273                 int index;
1274                 uint32_t data;
1275         };
1276         static const ctrl ctrls[] = {
1277                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1278                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1279
1280                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1281                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1282                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1283                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1284         };
1285
1286         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1287                 uint32_t flipped = htonl(ctrls[req].data);
1288                 static uint8_t value[4];
1289                 memcpy(value, &flipped, sizeof(flipped));
1290                 int size = sizeof(value);
1291                 //if (ctrls[req].request == 215) size = 0;
1292                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1293                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1294                 if (rc < 0) {
1295                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1296                         exit(1);
1297                 }
1298
1299                 if (ctrls[req].index == 16 && rc == 4) {
1300                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1301                 }
1302
1303 #if 0
1304                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1305                 for (int i = 0; i < rc; ++i) {
1306                         printf("%02x", value[i]);
1307                 }
1308                 printf("\n");
1309 #endif
1310         }
1311
1312 #if 0
1313         // DEBUG
1314         for ( ;; ) {
1315                 static int my_index = 0;
1316                 static uint8_t value[4];
1317                 int size = sizeof(value);
1318                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1319                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1320                 if (rc < 0) {
1321                         fprintf(stderr, "Error on control\n");
1322                         exit(1);
1323                 }
1324                 printf("rc=%d index=%d: 0x", rc, my_index);
1325                 for (int i = 0; i < rc; ++i) {
1326                         printf("%02x", value[i]);
1327                 }
1328                 printf("\n");
1329         }
1330 #endif
1331
1332 #if 0
1333         // set up an asynchronous transfer of the timer register
1334         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1335         static int completed = 0;
1336
1337         xfr = libusb_alloc_transfer(0);
1338         libusb_fill_control_setup(cmdbuf,
1339             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1340                 /*index=*/44, /*length=*/4);
1341         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1342         xfr->user_data = this;
1343         libusb_submit_transfer(xfr);
1344
1345         // set up an asynchronous transfer of register 24
1346         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1347         static int completed2 = 0;
1348
1349         xfr = libusb_alloc_transfer(0);
1350         libusb_fill_control_setup(cmdbuf2,
1351             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1352                 /*index=*/24, /*length=*/4);
1353         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1354         xfr->user_data = this;
1355         libusb_submit_transfer(xfr);
1356 #endif
1357
1358         // set up an asynchronous transfer of the register dump
1359         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1360         static int completed3 = 0;
1361
1362         xfr = libusb_alloc_transfer(0);
1363         libusb_fill_control_setup(cmdbuf3,
1364             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1365                 /*index=*/current_register, /*length=*/4);
1366         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1367         xfr->user_data = this;
1368         //libusb_submit_transfer(xfr);
1369
1370         //audiofp = fopen("audio.raw", "wb");
1371
1372         // set up isochronous transfers for audio and video
1373         for (int e = 3; e <= 4; ++e) {
1374                 int num_transfers = 6;
1375                 for (int i = 0; i < num_transfers; ++i) {
1376                         size_t buf_size;
1377                         int num_iso_pack, size;
1378                         if (e == 3) {
1379                                 // Allocate for minimum width (because that will give us the most
1380                                 // number of packets, so we don't need to reallocate, but we'll
1381                                 // default to 720p for the first frame.
1382                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1383                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1384                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1385                         } else {
1386                                 size = 0xc0;
1387                                 num_iso_pack = 80;
1388                                 buf_size = num_iso_pack * size;
1389                         }
1390                         int num_bytes = num_iso_pack * size;
1391                         assert(size_t(num_bytes) <= buf_size);
1392 #if LIBUSB_API_VERSION >= 0x01000105
1393                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1394 #else
1395                         uint8_t *buf = nullptr;
1396 #endif
1397                         if (buf == nullptr) {
1398                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1399 #if LIBUSB_API_VERSION >= 0x01000105
1400                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1401 #else
1402                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1403 #endif
1404                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1405                                 buf = new uint8_t[num_bytes];
1406                         }
1407
1408                         xfr = libusb_alloc_transfer(num_iso_pack);
1409                         if (!xfr) {
1410                                 fprintf(stderr, "oom\n");
1411                                 exit(1);
1412                         }
1413
1414                         int ep = LIBUSB_ENDPOINT_IN | e;
1415                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1416                                 num_iso_pack, cb_xfr, nullptr, 0);
1417                         libusb_set_iso_packet_lengths(xfr, size);
1418                         xfr->user_data = this;
1419
1420                         if (e == 3) {
1421                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1422                         }
1423
1424                         iso_xfrs.push_back(xfr);
1425                 }
1426         }
1427 }
1428
1429 void BMUSBCapture::start_bm_capture()
1430 {
1431         int i = 0;
1432         for (libusb_transfer *xfr : iso_xfrs) {
1433                 int rc = libusb_submit_transfer(xfr);
1434                 ++i;
1435                 if (rc < 0) {
1436                         //printf("num_bytes=%d\n", num_bytes);
1437                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1438                                 xfr->endpoint, i, libusb_error_name(rc));
1439                         exit(1);
1440                 }
1441         }
1442
1443
1444 #if 0
1445         libusb_release_interface(devh, 0);
1446 out:
1447         if (devh)
1448                 libusb_close(devh);
1449         libusb_exit(nullptr);
1450         return rc;
1451 #endif
1452 }
1453
1454 void BMUSBCapture::stop_dequeue_thread()
1455 {
1456         dequeue_thread_should_quit = true;
1457         queues_not_empty.notify_all();
1458         dequeue_thread.join();
1459 }
1460
1461 void BMUSBCapture::start_bm_thread()
1462 {
1463         // Devices leaving are discovered by seeing the isochronous packets
1464         // coming back with errors, so only care about devices joining.
1465         if (card_connected_callback != nullptr) {
1466                 if (libusb_hotplug_register_callback(
1467                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1468                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1469                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1470                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1471                         exit(1);
1472                 }
1473         }
1474
1475         should_quit = false;
1476         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1477 }
1478
1479 void BMUSBCapture::stop_bm_thread()
1480 {
1481         should_quit = true;
1482         libusb_interrupt_event_handler(nullptr);
1483         usb_thread.join();
1484 }
1485
1486 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1487 {
1488         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1489         VideoMode auto_mode;
1490         auto_mode.name = "Autodetect";
1491         auto_mode.autodetect = true;
1492         return {{ 0, auto_mode }};
1493 }
1494
1495 uint32_t BMUSBCapture::get_current_video_mode() const
1496 {
1497         return 0;  // Matches get_available_video_modes().
1498 }
1499
1500 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1501 {
1502         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1503 }
1504
1505 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1506 {
1507         return {
1508                 { 0x00000000, "HDMI/SDI" },
1509                 { 0x02000000, "Component" },
1510                 { 0x04000000, "Composite" },
1511                 { 0x06000000, "S-video" }
1512         };
1513 }
1514
1515 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1516 {
1517         assert((video_input_id & ~0x06000000) == 0);
1518         current_video_input = video_input_id;
1519         update_capture_mode();
1520 }
1521
1522 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1523 {
1524         return {
1525                 { 0x00000000, "Embedded" },
1526                 { 0x10000000, "Analog" }
1527         };
1528 }
1529
1530 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1531 {
1532         assert((audio_input_id & ~0x10000000) == 0);
1533         current_audio_input = audio_input_id;
1534         update_capture_mode();
1535 }
1536
1537 void BMUSBCapture::update_capture_mode()
1538 {
1539         if (devh == nullptr) {
1540                 return;
1541         }
1542
1543         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1544         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1545         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1546                 mode |= htonl(0x20000000);
1547         } else {
1548                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1549         }
1550
1551         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1552                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1553         if (rc < 0) {
1554                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1555                 exit(1);
1556         }
1557 }
1558
1559 }  // namespace bmusb