]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Support other sample rates than 48 kHz.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.6.0
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
61 FILE *audiofp;
62
63 thread usb_thread;
64 atomic<bool> should_quit;
65
// Returns the size in bytes of one line of v210-packed video that is
// <width> pixels wide. Every started group of six pixels takes up four
// 32-bit words.
int v210_stride(int width)
{
        const int num_pixel_groups = (width + 5) / 6;
        return num_pixel_groups * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row in the table of known video formats. The member order matters,
// since rows are written as positional aggregate initializers.
struct VideoFormatEntry {
        // Format code after the flag bits have been masked away.
        uint16_t normalized_video_format;

        unsigned width;
        unsigned height;
        unsigned second_field_start;

        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction (nom/den).
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
203                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
204                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
205                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
206                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
207                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
208                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
209                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
210                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
211                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
212                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
213                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
214                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
215         };
216         for (const VideoFormatEntry &entry : entries) {
217                 if (normalized_video_format == entry.normalized_video_format) {
218                         decoded_video_format->width = entry.width;
219                         decoded_video_format->height = entry.height;
220                         if (video_format & 0x0800) {
221                                 decoded_video_format->stride = entry.width * 2;
222                         } else {
223                                 decoded_video_format->stride = v210_stride(entry.width);
224                         }
225                         decoded_video_format->second_field_start = entry.second_field_start;
226                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
227                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
228                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
229                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
230                         decoded_video_format->interlaced = entry.interlaced;
231                         return true;
232                 }
233         }
234
235         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
236         decoded_video_format->width = 1280;
237         decoded_video_format->height = 720;
238         decoded_video_format->stride = 1280 * 2;
239         decoded_video_format->frame_rate_nom = 60;
240         decoded_video_format->frame_rate_den = 1;
241         return false;
242 }
243
244 // There are seemingly no direct indicators of sample rate; you just get
245 // one frame's worth and have to guess from that.
246 int guess_sample_rate(const VideoFormat &video_format, size_t len)
247 {
248         size_t num_samples = len / 3 / 8;
249         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
250
251         // See if we match or are very close to any of the mandatory HDMI sample rates.
252         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
253         for (int rate : candidate_sample_rates) {
254                 if (abs(int(num_samples_per_second) - rate) < 50) {
255                         return rate;
256                 }
257         }
258
259         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, assuming 48000 Hz\n",
260                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second);
261         return 48000;
262 }
263
264 }  // namespace
265
266 FrameAllocator::~FrameAllocator() {}
267
268 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
269         : frame_size(frame_size)
270 {
271         for (size_t i = 0; i < num_queued_frames; ++i) {
272                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
273         }
274 }
275
276 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
277 {
278         Frame vf;
279         vf.owner = this;
280
281         unique_lock<mutex> lock(freelist_mutex);  // Meh.
282         if (freelist.empty()) {
283                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
284                         frame_size);
285         } else {
286                 vf.data = freelist.top().release();
287                 vf.size = frame_size;
288                 freelist.pop();  // Meh.
289         }
290         return vf;
291 }
292
293 void MallocFrameAllocator::release_frame(Frame frame)
294 {
295         if (frame.overflow > 0) {
296                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
297         }
298         unique_lock<mutex> lock(freelist_mutex);
299         freelist.push(unique_ptr<uint8_t[]>(frame.data));
300 }
301
// Returns true if <a> comes before <b> when 16-bit values are treated as
// wrapping around: that is, if stepping forward from <a> reaches <b> in at
// least one and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 0x10000.
        const uint16_t forward_distance = static_cast<uint16_t>(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
313
314 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
315 {
316         unique_lock<mutex> lock(queue_lock);
317         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
318                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
319                         q->back().timecode, timecode);
320                 frame.owner->release_frame(frame);
321                 return;
322         }
323
324         QueuedFrame qf;
325         qf.format = format;
326         qf.timecode = timecode;
327         qf.frame = frame;
328         q->push_back(move(qf));
329         queues_not_empty.notify_one();  // might be spurious
330 }
331
332 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
333 {
334         FILE *fp = fopen(filename, "wb");
335         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
336                 printf("short write!\n");
337         }
338         fclose(fp);
339 }
340
341 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
342 {
343         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
344 }
345
346 void BMUSBCapture::dequeue_thread_func()
347 {
348         char thread_name[16];
349         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
350         pthread_setname_np(pthread_self(), thread_name);
351
352         if (has_dequeue_callbacks) {
353                 dequeue_init_callback();
354         }
355         size_t last_sample_rate = 48000;
356         while (!dequeue_thread_should_quit) {
357                 unique_lock<mutex> lock(queue_lock);
358                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
359
360                 if (dequeue_thread_should_quit) break;
361
362                 uint16_t video_timecode = pending_video_frames.front().timecode;
363                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
364                 AudioFormat audio_format;
365                 audio_format.bits_per_sample = 24;
366                 audio_format.num_channels = 8;
367                 audio_format.sample_rate = last_sample_rate;
368                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
369                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
370                                 video_timecode);
371                         QueuedFrame video_frame = pending_video_frames.front();
372                         pending_video_frames.pop_front();
373                         lock.unlock();
374                         video_frame_allocator->release_frame(video_frame.frame);
375                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
376                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
377                                 audio_timecode);
378                         QueuedFrame audio_frame = pending_audio_frames.front();
379                         pending_audio_frames.pop_front();
380                         lock.unlock();
381                         audio_format.id = audio_frame.format;
382
383                         // Use the video format of the pending frame.
384                         QueuedFrame video_frame = pending_video_frames.front();
385                         VideoFormat video_format;
386                         decode_video_format(video_frame.format, &video_format);
387
388                         frame_callback(audio_timecode,
389                                        FrameAllocator::Frame(), 0, video_format,
390                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
391                 } else {
392                         QueuedFrame video_frame = pending_video_frames.front();
393                         QueuedFrame audio_frame = pending_audio_frames.front();
394                         pending_audio_frames.pop_front();
395                         pending_video_frames.pop_front();
396                         lock.unlock();
397
398 #if 0
399                         char filename[255];
400                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
401                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
402                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
403 #endif
404
405                         VideoFormat video_format;
406                         audio_format.id = audio_frame.format;
407                         if (decode_video_format(video_frame.format, &video_format)) {
408                                 if (audio_frame.frame.len != 0) {
409                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len);
410                                         last_sample_rate = audio_format.sample_rate;
411                                 }
412                                 frame_callback(video_timecode,
413                                                video_frame.frame, HEADER_SIZE, video_format,
414                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
415                         } else {
416                                 audio_format.sample_rate = last_sample_rate;
417                                 frame_callback(video_timecode,
418                                                FrameAllocator::Frame(), 0, video_format,
419                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
420                         }
421                 }
422         }
423         if (has_dequeue_callbacks) {
424                 dequeue_cleanup_callback();
425         }
426 }
427
428 void BMUSBCapture::start_new_frame(const uint8_t *start)
429 {
430         uint16_t format = (start[3] << 8) | start[2];
431         uint16_t timecode = (start[1] << 8) | start[0];
432
433         if (current_video_frame.len > 0) {
434                 current_video_frame.received_timestamp = steady_clock::now();
435
436                 // If format is 0x0800 (no signal), add a fake (empty) audio
437                 // frame to get it out of the queue.
438                 // TODO: Figure out if there are other formats that come with
439                 // no audio, and treat them the same.
440                 if (format == 0x0800) {
441                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
442                         if (fake_audio_frame.data == nullptr) {
443                                 // Oh well, it's just a no-signal frame anyway.
444                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
445                                 current_video_frame.owner->release_frame(current_video_frame);
446                                 current_video_frame = video_frame_allocator->alloc_frame();
447                                 return;
448                         }
449                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
450                 }
451                 //dump_frame();
452                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
453
454                 // Update the assumed frame width. We might be one frame too late on format changes,
455                 // but it's much better than asking the user to choose manually.
456                 VideoFormat video_format;
457                 if (decode_video_format(format, &video_format)) {
458                         assumed_frame_width = video_format.width;
459                 }
460         }
461         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
462         //      format, timecode,
463         //      //start[7], start[6], start[5], start[4],
464         //      read_current_frame, FRAME_SIZE);
465
466         current_video_frame = video_frame_allocator->alloc_frame();
467         //if (current_video_frame.data == nullptr) {
468         //      read_current_frame = -1;
469         //} else {
470         //      read_current_frame = 0;
471         //}
472 }
473
474 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
475 {
476         uint16_t format = (start[3] << 8) | start[2];
477         uint16_t timecode = (start[1] << 8) | start[0];
478         if (current_audio_frame.len > 0) {
479                 current_audio_frame.received_timestamp = steady_clock::now();
480                 //dump_audio_block();
481                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
482         }
483         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
484         //      format, timecode);
485         current_audio_frame = audio_frame_allocator->alloc_frame();
486 }
487
488 #if 0
489 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
490 {
491         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
492         for (unsigned j = 0; j < pack->actual_length; j++) {
493         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
494                 printf("%02x", xfr->buffer[j + offset]);
495                 if ((j % 16) == 15)
496                         printf("\n");
497                 else if ((j % 8) == 7)
498                         printf("  ");
499                 else
500                         printf(" ");
501         }
502 }
503 #endif
504
// Splits <n> bytes from <src> into two streams: bytes at even offsets go to
// <dest1> and bytes at odd offsets go to <dest2>. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t in_pos = 0, out_pos = 0; in_pos < n; in_pos += 2, ++out_pos) {
                dest1[out_pos] = src[in_pos];
                dest2[out_pos] = src[in_pos + 1];
        }
}
516
517 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
518 {
519         if (current_frame->data == nullptr ||
520             current_frame->len > current_frame->size ||
521             start == end) {
522                 return;
523         }
524
525         int bytes = end - start;
526         if (current_frame->len + bytes > current_frame->size) {
527                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
528                 current_frame->len = current_frame->size;
529                 if (current_frame->overflow > 1048576) {
530                         printf("%d bytes overflow after last %s frame\n",
531                                 int(current_frame->overflow), frame_type_name);
532                         current_frame->overflow = 0;
533                 }
534                 //dump_frame();
535         } else {
536                 if (current_frame->interleaved) {
537                         uint8_t *data = current_frame->data + current_frame->len / 2;
538                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
539                         if (current_frame->len % 2 == 1) {
540                                 ++data;
541                                 swap(data, data2);
542                         }
543                         if (bytes % 2 == 1) {
544                                 *data++ = *start++;
545                                 swap(data, data2);
546                                 ++current_frame->len;
547                                 --bytes;
548                         }
549                         memcpy_interleaved(data, data2, start, bytes);
550                         current_frame->len += bytes;
551                 } else {
552                         memcpy(current_frame->data + current_frame->len, start, bytes);
553                         current_frame->len += bytes;
554                 }
555         }
556 }
557
558 #if 0
// Debugging aid: prints <name> followed by the 32 bytes of <n> in hex, in
// four groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        memcpy(bytes, &n, sizeof(bytes));

        printf("%-10s:", name);
        for (int idx = 0; idx < 32; ++idx) {
                if (idx != 0 && idx % 8 == 0) {
                        // Extra space between groups.
                        printf(" ");
                }
                printf(" %02x", bytes[idx]);
        }
        printf("\n");
}
599 #endif
600
601 #ifndef HAS_MULTIVERSIONING
602
603 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
604 {
605         // No fast path possible unless we have multiversioning.
606         return start;
607 }
608
609 #else  // defined(HAS_MULTIVERSIONING)
610
611 __attribute__((target("sse4.1")))
612 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
613
614 __attribute__((target("avx2")))
615 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
616
617 // Does a memcpy and memchr in one to reduce processing time.
618 // Note that the benefit is somewhat limited if your L3 cache is small,
619 // as you'll (unfortunately) spend most of the time loading the data
620 // from main memory.
621 //
622 // Complicated cases are left to the slow path; it basically stops copying
623 // up until the first instance of "sync_char" (usually a bit before, actually).
624 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
625 // data, and what we really need this for is the 00 00 ff ff marker in video data.
626 __attribute__((target("default")))
627 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
628 {
629         // No fast path possible unless we have SSE 4.1 or higher.
630         return start;
631 }
632
633 __attribute__((target("sse4.1", "avx2")))
634 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
635 {
636         if (current_frame->data == nullptr ||
637             current_frame->len > current_frame->size ||
638             start == limit) {
639                 return start;
640         }
641         size_t orig_bytes = limit - start;
642         if (orig_bytes < 128) {
643                 // Don't bother.
644                 return start;
645         }
646
647         // Don't read more bytes than we can write.
648         limit = min(limit, start + (current_frame->size - current_frame->len));
649
650         // Align end to 32 bytes.
651         limit = (const uint8_t *)(intptr_t(limit) & ~31);
652
653         if (start >= limit) {
654                 return start;
655         }
656
657         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
658         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
659         if (aligned_start != start) {
660                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
661                 if (sync_start == nullptr) {
662                         add_to_frame(current_frame, "", start, aligned_start);
663                 } else {
664                         add_to_frame(current_frame, "", start, sync_start);
665                         return sync_start;
666                 }
667         }
668
669         // Make the length a multiple of 64.
670         if (current_frame->interleaved) {
671                 if (((limit - aligned_start) % 64) != 0) {
672                         limit -= 32;
673                 }
674                 assert(((limit - aligned_start) % 64) == 0);
675         }
676
677         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
678 }
679
680 __attribute__((target("avx2")))
681 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
682 {
683         const __m256i needle = _mm256_set1_epi8(sync_char);
684
685         const __restrict __m256i *in = (const __m256i *)aligned_start;
686         if (current_frame->interleaved) {
687                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
688                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
689                 if (current_frame->len % 2 == 1) {
690                         swap(out1, out2);
691                 }
692
693                 __m256i shuffle_cw = _mm256_set_epi8(
694                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
695                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
696                 while (in < (const __m256i *)limit) {
697                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
698                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
699                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
700
701                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
702                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
703                         __m256i found = _mm256_or_si256(found1, found2);
704
705                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
706                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
707                 
708                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
709                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
710
711                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
712                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
713
714                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
715                         _mm256_storeu_si256(out2, hi);
716
717                         if (!_mm256_testz_si256(found, found)) {
718                                 break;
719                         }
720
721                         in += 2;
722                         ++out1;
723                         ++out2;
724                 }
725                 current_frame->len += (uint8_t *)in - aligned_start;
726         } else {
727                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
728                 while (in < (const __m256i *)limit) {
729                         __m256i data = _mm256_load_si256(in);
730                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
731                         __m256i found = _mm256_cmpeq_epi8(data, needle);
732                         if (!_mm256_testz_si256(found, found)) {
733                                 break;
734                         }
735
736                         ++in;
737                         ++out;
738                 }
739                 current_frame->len = (uint8_t *)out - current_frame->data;
740         }
741
742         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
743         return (const uint8_t *)in;
744 }
745
746 __attribute__((target("sse4.1")))
747 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
748 {
749         const __m128i needle = _mm_set1_epi8(sync_char);
750
751         const __m128i *in = (const __m128i *)aligned_start;
752         if (current_frame->interleaved) {
753                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
754                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
755                 if (current_frame->len % 2 == 1) {
756                         swap(out1, out2);
757                 }
758
759                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
760                 while (in < (const __m128i *)limit) {
761                         __m128i data1 = _mm_load_si128(in);
762                         __m128i data2 = _mm_load_si128(in + 1);
763                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
764                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
765                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
766                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
767                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
768                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
769                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
770                         _mm_storeu_si128(out2, hi);
771                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
772                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
773                         if (!_mm_testz_si128(found1, found1) ||
774                             !_mm_testz_si128(found2, found2)) {
775                                 break;
776                         }
777
778                         in += 2;
779                         ++out1;
780                         ++out2;
781                 }
782                 current_frame->len += (uint8_t *)in - aligned_start;
783         } else {
784                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
785                 while (in < (const __m128i *)limit) {
786                         __m128i data = _mm_load_si128(in);
787                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
788                         __m128i found = _mm_cmpeq_epi8(data, needle);
789                         if (!_mm_testz_si128(found, found)) {
790                                 break;
791                         }
792
793                         ++in;
794                         ++out;
795                 }
796                 current_frame->len = (uint8_t *)out - current_frame->data;
797         }
798
799         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
800         return (const uint8_t *)in;
801 }
802
803 #endif  // defined(HAS_MULTIVERSIONING)
804
805 void decode_packs(const libusb_transfer *xfr,
806                   const char *sync_pattern,
807                   int sync_length,
808                   FrameAllocator::Frame *current_frame,
809                   const char *frame_type_name,
810                   function<void(const uint8_t *start)> start_callback)
811 {
812         int offset = 0;
813         for (int i = 0; i < xfr->num_iso_packets; i++) {
814                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
815
816                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
817                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
818                         continue;
819 //exit(5);
820                 }
821
822                 const uint8_t *start = xfr->buffer + offset;
823                 const uint8_t *limit = start + pack->actual_length;
824                 while (start < limit) {  // Usually runs only one iteration.
825                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
826                         if (start == limit) break;
827                         assert(start < limit);
828
829                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
830                         if (start_next_frame == nullptr) {
831                                 // add the rest of the buffer
832                                 add_to_frame(current_frame, frame_type_name, start, limit);
833                                 break;
834                         } else {
835                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
836                                 start = start_next_frame + sync_length;  // skip sync
837                                 start_callback(start);
838                         }
839                 }
840 #if 0
841                 dump_pack(xfr, offset, pack);
842 #endif
843                 offset += pack->length;
844         }
845 }
846
847 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
848 {
849         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
850             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
851                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
852                 libusb_free_transfer(xfr);
853                 exit(3);
854         }
855
856         assert(xfr->user_data != nullptr);
857         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
858
859         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
860                 if (!usb->disconnected) {
861                         fprintf(stderr, "Device went away, stopping transfers.\n");
862                         usb->disconnected = true;
863                         if (usb->card_disconnected_callback) {
864                                 usb->card_disconnected_callback();
865                         }
866                 }
867                 // Don't reschedule the transfer; the loop will stop by itself.
868                 return;
869         }
870
871         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
872                 if (xfr->endpoint == 0x84) {
873                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
874                 } else {
875                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
876
877                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
878                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
879                 }
880         }
881         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
882                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
883                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
884 #if 0
885                 if (setup->wIndex == 44) {
886                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
887                 } else {
888                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
889                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
890                 }
891 #else
892                 memcpy(usb->register_file + usb->current_register, buf, 4);
893                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
894                 if (usb->current_register == 0) {
895                         // read through all of them
896                         printf("register dump:");
897                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
898                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
899                         }
900                         printf("\n");
901                 }
902                 libusb_fill_control_setup(xfr->buffer,
903                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
904                         /*index=*/usb->current_register, /*length=*/4);
905 #endif
906         }
907
908 #if 0
909         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
910         for (i = 0; i < xfr->actual_length; i++) {
911                 printf("%02x", xfr->buffer[i]);
912                 if (i % 16)
913                         printf("\n");
914                 else if (i % 8)
915                         printf("  ");
916                 else
917                         printf(" ");
918         }
919 #endif
920
921         int rc = libusb_submit_transfer(xfr);
922         if (rc < 0) {
923                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
924                 exit(1);
925         }
926 }
927
928 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
929 {
930         if (card_connected_callback != nullptr) {
931                 libusb_device_descriptor desc;
932                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
933                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
934                         libusb_unref_device(dev);
935                         return 1;
936                 }
937
938                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
939                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
940                         card_connected_callback(dev);  // Callback takes ownership.
941                         return 0;
942                 }
943         }
944         libusb_unref_device(dev);
945         return 0;
946 }
947
948 void BMUSBCapture::usb_thread_func()
949 {
950         sched_param param;
951         memset(&param, 0, sizeof(param));
952         param.sched_priority = 1;
953         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
954                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
955         }
956         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
957         while (!should_quit) {
958                 timeval sec { 1, 0 };
959                 int rc = libusb_handle_events_timeout(nullptr, &sec);
960                 if (rc != LIBUSB_SUCCESS)
961                         break;
962         }
963 }
964
965 namespace {
966
967 struct USBCardDevice {
968         uint16_t product;
969         uint8_t bus, port;
970         libusb_device *device;
971 };
972
// Maps a USB product ID to a human-readable product name.
// Only the two known product IDs are valid; anything else trips an assert
// (and yields nullptr if asserts are compiled out).
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                break;
        }
        assert(false);
        return nullptr;
}
984
985 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
986 {
987         const char *product_name = get_product_name(product);
988
989         char buf[256];
990         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
991                 id, bus, port, product_name);
992         return buf;
993 }
994
995 vector<USBCardDevice> find_all_cards()
996 {
997         libusb_device **devices;
998         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
999         if (num_devices == -1) {
1000                 fprintf(stderr, "Error finding USB devices\n");
1001                 exit(1);
1002         }
1003         vector<USBCardDevice> found_cards;
1004         for (ssize_t i = 0; i < num_devices; ++i) {
1005                 libusb_device_descriptor desc;
1006                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1007                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1008                         exit(1);
1009                 }
1010
1011                 uint8_t bus = libusb_get_bus_number(devices[i]);
1012                 uint8_t port = libusb_get_port_number(devices[i]);
1013
1014                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1015                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1016                         libusb_unref_device(devices[i]);
1017                         continue;
1018                 }
1019
1020                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1021         }
1022         libusb_free_device_list(devices, 0);
1023
1024         // Sort the devices to get a consistent ordering.
1025         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1026                 if (a.product != b.product)
1027                         return a.product < b.product;
1028                 if (a.bus != b.bus)
1029                         return a.bus < b.bus;
1030                 return a.port < b.port;
1031         });
1032
1033         return found_cards;
1034 }
1035
1036 libusb_device_handle *open_card(int card_index, string *description)
1037 {
1038         vector<USBCardDevice> found_cards = find_all_cards();
1039
1040         for (size_t i = 0; i < found_cards.size(); ++i) {
1041                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1042                 fprintf(stderr, "%s\n", tmp_description.c_str());
1043                 if (i == size_t(card_index)) {
1044                         *description = tmp_description;
1045                 }
1046         }
1047
1048         if (size_t(card_index) >= found_cards.size()) {
1049                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1050                 exit(1);
1051         }
1052
1053         libusb_device_handle *devh;
1054         int rc = libusb_open(found_cards[card_index].device, &devh);
1055         if (rc < 0) {
1056                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1057                 exit(1);
1058         }
1059
1060         for (size_t i = 0; i < found_cards.size(); ++i) {
1061                 libusb_unref_device(found_cards[i].device);
1062         }
1063
1064         return devh;
1065 }
1066
1067 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1068 {
1069         uint8_t bus = libusb_get_bus_number(dev);
1070         uint8_t port = libusb_get_port_number(dev);
1071
1072         libusb_device_descriptor desc;
1073         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1074                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1075                 exit(1);
1076         }
1077
1078         *description = get_card_description(card_index, bus, port, desc.idProduct);
1079
1080         libusb_device_handle *devh;
1081         int rc = libusb_open(dev, &devh);
1082         if (rc < 0) {
1083                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1084                 exit(1);
1085         }
1086
1087         return devh;
1088 }
1089
1090 }  // namespace
1091
1092 unsigned BMUSBCapture::num_cards()
1093 {
1094         int rc = libusb_init(nullptr);
1095         if (rc < 0) {
1096                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1097                 exit(1);
1098         }
1099
1100         vector<USBCardDevice> found_cards = find_all_cards();
1101         unsigned ret = found_cards.size();
1102         for (size_t i = 0; i < found_cards.size(); ++i) {
1103                 libusb_unref_device(found_cards[i].device);
1104         }
1105         return ret;
1106 }
1107
1108 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1109 {
1110         current_pixel_format = pixel_format;
1111         update_capture_mode();
1112 }
1113
1114 void BMUSBCapture::configure_card()
1115 {
1116         if (video_frame_allocator == nullptr) {
1117                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1118                 set_video_frame_allocator(owned_video_frame_allocator.get());
1119         }
1120         if (audio_frame_allocator == nullptr) {
1121                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1122                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1123         }
1124         dequeue_thread_should_quit = false;
1125         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1126
1127         int rc;
1128         struct libusb_transfer *xfr;
1129
1130         rc = libusb_init(nullptr);
1131         if (rc < 0) {
1132                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1133                 exit(1);
1134         }
1135
1136         if (dev == nullptr) {
1137                 devh = open_card(card_index, &description);
1138         } else {
1139                 devh = open_card(card_index, dev, &description);
1140                 libusb_unref_device(dev);
1141         }
1142         if (!devh) {
1143                 fprintf(stderr, "Error finding USB device\n");
1144                 exit(1);
1145         }
1146
1147         libusb_config_descriptor *config;
1148         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1149         if (rc < 0) {
1150                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1151                 exit(1);
1152         }
1153
1154 #if 0
1155         printf("%d interface\n", config->bNumInterfaces);
1156         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1157                 printf("  interface %d\n", interface_number);
1158                 const libusb_interface *interface = &config->interface[interface_number];
1159                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1160                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1161                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1162                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1163                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1164                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1165                         }
1166                 }
1167         }
1168 #endif
1169
1170         rc = libusb_set_configuration(devh, /*configuration=*/1);
1171         if (rc < 0) {
1172                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1173                 exit(1);
1174         }
1175
1176         rc = libusb_claim_interface(devh, 0);
1177         if (rc < 0) {
1178                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1179                 exit(1);
1180         }
1181
1182         // Alternate setting 1 is output, alternate setting 2 is input.
1183         // Card is reset when switching alternates, so the driver uses
1184         // this “double switch” when it wants to reset.
1185         //
1186         // There's also alternate settings 3 and 4, which seem to be
1187         // like 1 and 2 except they advertise less bandwidth needed.
1188         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1189         if (rc < 0) {
1190                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1191                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1192                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1193                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1194                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1195                 }
1196                 exit(1);
1197         }
1198         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1199         if (rc < 0) {
1200                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1201                 exit(1);
1202         }
1203 #if 0
1204         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1205         if (rc < 0) {
1206                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1207                 exit(1);
1208         }
1209 #endif
1210
1211 #if 0
1212         rc = libusb_claim_interface(devh, 3);
1213         if (rc < 0) {
1214                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1215                 exit(1);
1216         }
1217 #endif
1218
1219         // theories:
1220         //   44 is some kind of timer register (first 16 bits count upwards)
1221         //   24 is some sort of watchdog?
1222         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1223         //      (or will go to 0x73c60010?), also seen 0x73c60100
1224         //   12 also changes all the time, unclear why  
1225         //   16 seems to be autodetected mode somehow
1226         //      --    this is e00115e0 after reset?
1227         //                    ed0115e0 after mode change [to output?]
1228         //                    2d0015e0 after more mode change [to input]
1229         //                    ed0115e0 after more mode change
1230         //                    2d0015e0 after more mode change
1231         //
1232         //                    390115e0 seems to indicate we have signal
1233         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1234         //
1235         //                    200015e0 on startup
1236         //         changes to 250115e0 when we sync to the signal
1237         //
1238         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1239         //
1240         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
1241         //
1242         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1243         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1244         //
1245         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1246         //    perhaps some of them are related to analog output?
1247         //
1248         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1249         //    but the driver sets it to 0x8036802a at some point.
1250         //
1251         //    all of this is on request 214/215. other requests (192, 219,
1252         //    222, 223, 224) are used for firmware upgrade. Probably best to
1253         //    stay out of it unless you know what you're doing.
1254         //
1255         //
1256         // register 16:
1257         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1258         //
1259         // theories:
1260         //   0x01 - stable signal
1261         //   0x04 - deep color
1262         //   0x08 - unknown (audio??)
1263         //   0x20 - 720p??
1264         //   0x30 - 576p??
1265
1266         update_capture_mode();
1267
1268         struct ctrl {
1269                 int endpoint;
1270                 int request;
1271                 int index;
1272                 uint32_t data;
1273         };
1274         static const ctrl ctrls[] = {
1275                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1276                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1277
1278                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1279                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1280                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1281                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1282         };
1283
1284         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1285                 uint32_t flipped = htonl(ctrls[req].data);
1286                 static uint8_t value[4];
1287                 memcpy(value, &flipped, sizeof(flipped));
1288                 int size = sizeof(value);
1289                 //if (ctrls[req].request == 215) size = 0;
1290                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1291                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1292                 if (rc < 0) {
1293                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1294                         exit(1);
1295                 }
1296
1297                 if (ctrls[req].index == 16 && rc == 4) {
1298                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1299                 }
1300
1301 #if 0
1302                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1303                 for (int i = 0; i < rc; ++i) {
1304                         printf("%02x", value[i]);
1305                 }
1306                 printf("\n");
1307 #endif
1308         }
1309
1310 #if 0
1311         // DEBUG
1312         for ( ;; ) {
1313                 static int my_index = 0;
1314                 static uint8_t value[4];
1315                 int size = sizeof(value);
1316                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1317                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1318                 if (rc < 0) {
1319                         fprintf(stderr, "Error on control\n");
1320                         exit(1);
1321                 }
1322                 printf("rc=%d index=%d: 0x", rc, my_index);
1323                 for (int i = 0; i < rc; ++i) {
1324                         printf("%02x", value[i]);
1325                 }
1326                 printf("\n");
1327         }
1328 #endif
1329
1330 #if 0
1331         // set up an asynchronous transfer of the timer register
1332         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1333         static int completed = 0;
1334
1335         xfr = libusb_alloc_transfer(0);
1336         libusb_fill_control_setup(cmdbuf,
1337             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1338                 /*index=*/44, /*length=*/4);
1339         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1340         xfr->user_data = this;
1341         libusb_submit_transfer(xfr);
1342
1343         // set up an asynchronous transfer of register 24
1344         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1345         static int completed2 = 0;
1346
1347         xfr = libusb_alloc_transfer(0);
1348         libusb_fill_control_setup(cmdbuf2,
1349             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1350                 /*index=*/24, /*length=*/4);
1351         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1352         xfr->user_data = this;
1353         libusb_submit_transfer(xfr);
1354 #endif
1355
1356         // set up an asynchronous transfer of the register dump
1357         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1358         static int completed3 = 0;
1359
1360         xfr = libusb_alloc_transfer(0);
1361         libusb_fill_control_setup(cmdbuf3,
1362             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1363                 /*index=*/current_register, /*length=*/4);
1364         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1365         xfr->user_data = this;
1366         //libusb_submit_transfer(xfr);
1367
1368         //audiofp = fopen("audio.raw", "wb");
1369
1370         // set up isochronous transfers for audio and video
1371         for (int e = 3; e <= 4; ++e) {
1372                 int num_transfers = 6;
1373                 for (int i = 0; i < num_transfers; ++i) {
1374                         size_t buf_size;
1375                         int num_iso_pack, size;
1376                         if (e == 3) {
1377                                 // Allocate for minimum width (because that will give us the most
1378                                 // number of packets, so we don't need to reallocate, but we'll
1379                                 // default to 720p for the first frame.
1380                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1381                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1382                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1383                         } else {
1384                                 size = 0xc0;
1385                                 num_iso_pack = 80;
1386                                 buf_size = num_iso_pack * size;
1387                         }
1388                         int num_bytes = num_iso_pack * size;
1389                         assert(size_t(num_bytes) <= buf_size);
1390 #if LIBUSB_API_VERSION >= 0x01000105
1391                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1392 #else
1393                         uint8_t *buf = nullptr;
1394 #endif
1395                         if (buf == nullptr) {
1396                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1397 #if LIBUSB_API_VERSION >= 0x01000105
1398                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1399 #else
1400                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1401 #endif
1402                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1403                                 buf = new uint8_t[num_bytes];
1404                         }
1405
1406                         xfr = libusb_alloc_transfer(num_iso_pack);
1407                         if (!xfr) {
1408                                 fprintf(stderr, "oom\n");
1409                                 exit(1);
1410                         }
1411
1412                         int ep = LIBUSB_ENDPOINT_IN | e;
1413                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1414                                 num_iso_pack, cb_xfr, nullptr, 0);
1415                         libusb_set_iso_packet_lengths(xfr, size);
1416                         xfr->user_data = this;
1417
1418                         if (e == 3) {
1419                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1420                         }
1421
1422                         iso_xfrs.push_back(xfr);
1423                 }
1424         }
1425 }
1426
1427 void BMUSBCapture::start_bm_capture()
1428 {
1429         int i = 0;
1430         for (libusb_transfer *xfr : iso_xfrs) {
1431                 int rc = libusb_submit_transfer(xfr);
1432                 ++i;
1433                 if (rc < 0) {
1434                         //printf("num_bytes=%d\n", num_bytes);
1435                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1436                                 xfr->endpoint, i, libusb_error_name(rc));
1437                         exit(1);
1438                 }
1439         }
1440
1441
1442 #if 0
1443         libusb_release_interface(devh, 0);
1444 out:
1445         if (devh)
1446                 libusb_close(devh);
1447         libusb_exit(nullptr);
1448         return rc;
1449 #endif
1450 }
1451
1452 void BMUSBCapture::stop_dequeue_thread()
1453 {
1454         dequeue_thread_should_quit = true;
1455         queues_not_empty.notify_all();
1456         dequeue_thread.join();
1457 }
1458
1459 void BMUSBCapture::start_bm_thread()
1460 {
1461         // Devices leaving are discovered by seeing the isochronous packets
1462         // coming back with errors, so only care about devices joining.
1463         if (card_connected_callback != nullptr) {
1464                 if (libusb_hotplug_register_callback(
1465                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1466                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1467                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1468                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1469                         exit(1);
1470                 }
1471         }
1472
1473         should_quit = false;
1474         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1475 }
1476
1477 void BMUSBCapture::stop_bm_thread()
1478 {
1479         should_quit = true;
1480         usb_thread.join();
1481 }
1482
1483 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1484 {
1485         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1486         VideoMode auto_mode;
1487         auto_mode.name = "Autodetect";
1488         auto_mode.autodetect = true;
1489         return {{ 0, auto_mode }};
1490 }
1491
1492 uint32_t BMUSBCapture::get_current_video_mode() const
1493 {
1494         return 0;  // Matches get_available_video_modes().
1495 }
1496
1497 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1498 {
1499         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1500 }
1501
1502 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1503 {
1504         return {
1505                 { 0x00000000, "HDMI/SDI" },
1506                 { 0x02000000, "Component" },
1507                 { 0x04000000, "Composite" },
1508                 { 0x06000000, "S-video" }
1509         };
1510 }
1511
1512 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1513 {
1514         assert((video_input_id & ~0x06000000) == 0);
1515         current_video_input = video_input_id;
1516         update_capture_mode();
1517 }
1518
1519 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1520 {
1521         return {
1522                 { 0x00000000, "Embedded" },
1523                 { 0x10000000, "Analog" }
1524         };
1525 }
1526
1527 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1528 {
1529         assert((audio_input_id & ~0x10000000) == 0);
1530         current_audio_input = audio_input_id;
1531         update_capture_mode();
1532 }
1533
1534 void BMUSBCapture::update_capture_mode()
1535 {
1536         if (devh == nullptr) {
1537                 return;
1538         }
1539
1540         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1541         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1542         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1543                 mode |= htonl(0x20000000);
1544         } else {
1545                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1546         }
1547
1548         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1549                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1550         if (rc < 0) {
1551                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1552                 exit(1);
1553         }
1554 }
1555
1556 }  // namespace bmusb