]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Add another 720p50 mode we discovered.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.7.2
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
61 FILE *audiofp;
62
63 thread usb_thread;
64 atomic<bool> should_quit;
65
// Returns the number of bytes in one line of v210-packed video of the given
// width. Pixels are packed in groups of six (a partial group is rounded up),
// and each group occupies four 32-bit words.
int v210_stride(int width)
{
        const int groups_of_six = (width + 5) / 6;
        return groups_of_six * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row in the table of known video formats (see decode_video_format()).
// Kept as a plain aggregate so that the table can be brace-initialized;
// the member order must match the order of the columns in that table.
struct VideoFormatEntry {
        // Format word from the card, with the flag bits masked off.
        uint16_t normalized_video_format;

        // Picture size, and the value copied into VideoFormat::second_field_start
        // (zero for all the progressive entries in the table).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Extra lines above and below the picture.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction.
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0151,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
203                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
204                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
205                 { 0x0161, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
206                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
207                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
208                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
209                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
210                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
211                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
212                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
213                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
214                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
215                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
216                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
217         };
218         for (const VideoFormatEntry &entry : entries) {
219                 if (normalized_video_format == entry.normalized_video_format) {
220                         decoded_video_format->width = entry.width;
221                         decoded_video_format->height = entry.height;
222                         if (video_format & 0x0800) {
223                                 decoded_video_format->stride = entry.width * 2;
224                         } else {
225                                 decoded_video_format->stride = v210_stride(entry.width);
226                         }
227                         decoded_video_format->second_field_start = entry.second_field_start;
228                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
229                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
230                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
231                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
232                         decoded_video_format->interlaced = entry.interlaced;
233                         return true;
234                 }
235         }
236
237         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
238         decoded_video_format->width = 1280;
239         decoded_video_format->height = 720;
240         decoded_video_format->stride = 1280 * 2;
241         decoded_video_format->frame_rate_nom = 60;
242         decoded_video_format->frame_rate_den = 1;
243         return false;
244 }
245
246 // There are seemingly no direct indicators of sample rate; you just get
247 // one frame's worth and have to guess from that.
248 int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
249 {
250         size_t num_samples = len / 3 / 8;
251         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
252
253         // See if we match or are very close to any of the mandatory HDMI sample rates.
254         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
255         for (int rate : candidate_sample_rates) {
256                 if (abs(int(num_samples_per_second) - rate) <= 100) {
257                         return rate;
258                 }
259         }
260
261         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, keeping capture at %d Hz\n",
262                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
263         return default_rate;
264 }
265
266 }  // namespace
267
268 FrameAllocator::~FrameAllocator() {}
269
270 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
271         : frame_size(frame_size)
272 {
273         for (size_t i = 0; i < num_queued_frames; ++i) {
274                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
275         }
276 }
277
278 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
279 {
280         Frame vf;
281         vf.owner = this;
282
283         unique_lock<mutex> lock(freelist_mutex);  // Meh.
284         if (freelist.empty()) {
285                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
286                         frame_size);
287         } else {
288                 vf.data = freelist.top().release();
289                 vf.size = frame_size;
290                 freelist.pop();  // Meh.
291         }
292         return vf;
293 }
294
295 void MallocFrameAllocator::release_frame(Frame frame)
296 {
297         if (frame.overflow > 0) {
298                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
299         }
300         unique_lock<mutex> lock(freelist_mutex);
301         freelist.push(unique_ptr<uint8_t[]>(frame.data));
302 }
303
// Returns whether a comes before b in 16-bit sequence-number order,
// i.e., whether b can be reached from a by stepping forward (with
// wraparound at 0x10000) at least one and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t steps_forward = uint16_t(b - a);
        return steps_forward != 0 && steps_forward < 0x8000;
}
315
316 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
317 {
318         unique_lock<mutex> lock(queue_lock);
319         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
320                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
321                         q->back().timecode, timecode);
322                 frame.owner->release_frame(frame);
323                 return;
324         }
325
326         QueuedFrame qf;
327         qf.format = format;
328         qf.timecode = timecode;
329         qf.frame = frame;
330         q->push_back(move(qf));
331         queues_not_empty.notify_one();  // might be spurious
332 }
333
334 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
335 {
336         FILE *fp = fopen(filename, "wb");
337         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
338                 printf("short write!\n");
339         }
340         fclose(fp);
341 }
342
343 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
344 {
345         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
346 }
347
348 void BMUSBCapture::dequeue_thread_func()
349 {
350         char thread_name[16];
351         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
352         pthread_setname_np(pthread_self(), thread_name);
353
354         if (has_dequeue_callbacks) {
355                 dequeue_init_callback();
356         }
357         size_t last_sample_rate = 48000;
358         while (!dequeue_thread_should_quit) {
359                 unique_lock<mutex> lock(queue_lock);
360                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
361
362                 if (dequeue_thread_should_quit) break;
363
364                 uint16_t video_timecode = pending_video_frames.front().timecode;
365                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
366                 AudioFormat audio_format;
367                 audio_format.bits_per_sample = 24;
368                 audio_format.num_channels = 8;
369                 audio_format.sample_rate = last_sample_rate;
370                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
371                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
372                                 video_timecode);
373                         QueuedFrame video_frame = pending_video_frames.front();
374                         pending_video_frames.pop_front();
375                         lock.unlock();
376                         video_frame_allocator->release_frame(video_frame.frame);
377                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
378                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
379                                 audio_timecode);
380                         QueuedFrame audio_frame = pending_audio_frames.front();
381                         pending_audio_frames.pop_front();
382                         lock.unlock();
383                         audio_format.id = audio_frame.format;
384
385                         // Use the video format of the pending frame.
386                         QueuedFrame video_frame = pending_video_frames.front();
387                         VideoFormat video_format;
388                         decode_video_format(video_frame.format, &video_format);
389
390                         frame_callback(audio_timecode,
391                                        FrameAllocator::Frame(), 0, video_format,
392                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
393                 } else {
394                         QueuedFrame video_frame = pending_video_frames.front();
395                         QueuedFrame audio_frame = pending_audio_frames.front();
396                         pending_audio_frames.pop_front();
397                         pending_video_frames.pop_front();
398                         lock.unlock();
399
400 #if 0
401                         char filename[255];
402                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
403                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
404                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
405 #endif
406
407                         VideoFormat video_format;
408                         audio_format.id = audio_frame.format;
409                         if (decode_video_format(video_frame.format, &video_format)) {
410                                 if (audio_frame.frame.len != 0) {
411                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
412                                         last_sample_rate = audio_format.sample_rate;
413                                 }
414                                 frame_callback(video_timecode,
415                                                video_frame.frame, HEADER_SIZE, video_format,
416                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
417                         } else {
418                                 video_frame_allocator->release_frame(video_frame.frame);
419                                 audio_format.sample_rate = last_sample_rate;
420                                 frame_callback(video_timecode,
421                                                FrameAllocator::Frame(), 0, video_format,
422                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
423                         }
424                 }
425         }
426         if (has_dequeue_callbacks) {
427                 dequeue_cleanup_callback();
428         }
429 }
430
431 void BMUSBCapture::start_new_frame(const uint8_t *start)
432 {
433         uint16_t format = (start[3] << 8) | start[2];
434         uint16_t timecode = (start[1] << 8) | start[0];
435
436         if (current_video_frame.len > 0) {
437                 current_video_frame.received_timestamp = steady_clock::now();
438
439                 // If format is 0x0800 (no signal), add a fake (empty) audio
440                 // frame to get it out of the queue.
441                 // TODO: Figure out if there are other formats that come with
442                 // no audio, and treat them the same.
443                 if (format == 0x0800) {
444                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
445                         if (fake_audio_frame.data == nullptr) {
446                                 // Oh well, it's just a no-signal frame anyway.
447                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
448                                 current_video_frame.owner->release_frame(current_video_frame);
449                                 current_video_frame = video_frame_allocator->alloc_frame();
450                                 return;
451                         }
452                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
453                 }
454                 //dump_frame();
455                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
456
457                 // Update the assumed frame width. We might be one frame too late on format changes,
458                 // but it's much better than asking the user to choose manually.
459                 VideoFormat video_format;
460                 if (decode_video_format(format, &video_format)) {
461                         assumed_frame_width = video_format.width;
462                 }
463         }
464         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
465         //      format, timecode,
466         //      //start[7], start[6], start[5], start[4],
467         //      read_current_frame, FRAME_SIZE);
468
469         current_video_frame = video_frame_allocator->alloc_frame();
470         //if (current_video_frame.data == nullptr) {
471         //      read_current_frame = -1;
472         //} else {
473         //      read_current_frame = 0;
474         //}
475 }
476
477 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
478 {
479         uint16_t format = (start[3] << 8) | start[2];
480         uint16_t timecode = (start[1] << 8) | start[0];
481         if (current_audio_frame.len > 0) {
482                 current_audio_frame.received_timestamp = steady_clock::now();
483                 //dump_audio_block();
484                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
485         }
486         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
487         //      format, timecode);
488         current_audio_frame = audio_frame_allocator->alloc_frame();
489 }
490
#if 0
// Debugging aid (compiled out): hex dumps the data actually received in one
// isochronous packet, 16 bytes per line with an extra space after the first
// eight. offset is where the packet starts within the transfer's buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        const unsigned num_bytes = pack->actual_length;
        //const unsigned num_bytes = min(pack->actual_length, 16u);
        for (unsigned j = 0; j < num_bytes; j++) {
                printf("%02x", xfr->buffer[j + offset]);

                const char *separator = " ";
                if ((j % 16) == 15) {
                        separator = "\n";
                } else if ((j % 8) == 7) {
                        separator = "  ";
                }
                printf("%s", separator);
        }
}
#endif
507
// Deinterleaves n bytes from src into two destinations: bytes at even
// offsets go to dest1 and bytes at odd offsets go to dest2, each packed
// contiguously. n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t i = 0; i < n; i += 2) {
                const size_t pair = i / 2;
                dest1[pair] = src[i];
                dest2[pair] = src[i + 1];
        }
}
519
520 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
521 {
522         if (current_frame->data == nullptr ||
523             current_frame->len > current_frame->size ||
524             start == end) {
525                 return;
526         }
527
528         int bytes = end - start;
529         if (current_frame->len + bytes > current_frame->size) {
530                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
531                 current_frame->len = current_frame->size;
532                 if (current_frame->overflow > 1048576) {
533                         printf("%d bytes overflow after last %s frame\n",
534                                 int(current_frame->overflow), frame_type_name);
535                         current_frame->overflow = 0;
536                 }
537                 //dump_frame();
538         } else {
539                 if (current_frame->interleaved) {
540                         uint8_t *data = current_frame->data + current_frame->len / 2;
541                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
542                         if (current_frame->len % 2 == 1) {
543                                 ++data;
544                                 swap(data, data2);
545                         }
546                         if (bytes % 2 == 1) {
547                                 *data++ = *start++;
548                                 swap(data, data2);
549                                 ++current_frame->len;
550                                 --bytes;
551                         }
552                         memcpy_interleaved(data, data2, start, bytes);
553                         current_frame->len += bytes;
554                 } else {
555                         memcpy(current_frame->data + current_frame->len, start, bytes);
556                         current_frame->len += bytes;
557                 }
558         }
559 }
560
#if 0
// Debugging aid (compiled out): prints the 32 bytes of an AVX2 register
// in hex, lowest byte first, with an extra space between each group of
// eight bytes. The line is prefixed with the given name.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        // Group separator.
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
603
604 #ifndef HAS_MULTIVERSIONING
605
606 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
607 {
608         // No fast path possible unless we have multiversioning.
609         return start;
610 }
611
612 #else  // defined(HAS_MULTIVERSIONING)
613
614 __attribute__((target("sse4.1")))
615 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
616
617 __attribute__((target("avx2")))
618 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
619
620 // Does a memcpy and memchr in one to reduce processing time.
621 // Note that the benefit is somewhat limited if your L3 cache is small,
622 // as you'll (unfortunately) spend most of the time loading the data
623 // from main memory.
624 //
625 // Complicated cases are left to the slow path; it basically stops copying
626 // up until the first instance of "sync_char" (usually a bit before, actually).
627 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
628 // data, and what we really need this for is the 00 00 ff ff marker in video data.
629 __attribute__((target("default")))
630 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
631 {
632         // No fast path possible unless we have SSE 4.1 or higher.
633         return start;
634 }
635
636 __attribute__((target("sse4.1", "avx2")))
637 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
638 {
639         if (current_frame->data == nullptr ||
640             current_frame->len > current_frame->size ||
641             start == limit) {
642                 return start;
643         }
644         size_t orig_bytes = limit - start;
645         if (orig_bytes < 128) {
646                 // Don't bother.
647                 return start;
648         }
649
650         // Don't read more bytes than we can write.
651         limit = min(limit, start + (current_frame->size - current_frame->len));
652
653         // Align end to 32 bytes.
654         limit = (const uint8_t *)(intptr_t(limit) & ~31);
655
656         if (start >= limit) {
657                 return start;
658         }
659
660         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
661         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
662         if (aligned_start != start) {
663                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
664                 if (sync_start == nullptr) {
665                         add_to_frame(current_frame, "", start, aligned_start);
666                 } else {
667                         add_to_frame(current_frame, "", start, sync_start);
668                         return sync_start;
669                 }
670         }
671
672         // Make the length a multiple of 64.
673         if (current_frame->interleaved) {
674                 if (((limit - aligned_start) % 64) != 0) {
675                         limit -= 32;
676                 }
677                 assert(((limit - aligned_start) % 64) == 0);
678         }
679
680         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
681 }
682
683 __attribute__((target("avx2")))
684 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
685 {
686         const __m256i needle = _mm256_set1_epi8(sync_char);
687
688         const __restrict __m256i *in = (const __m256i *)aligned_start;
689         if (current_frame->interleaved) {
690                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
691                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
692                 if (current_frame->len % 2 == 1) {
693                         swap(out1, out2);
694                 }
695
696                 __m256i shuffle_cw = _mm256_set_epi8(
697                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
698                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
699                 while (in < (const __m256i *)limit) {
700                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
701                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
702                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
703
704                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
705                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
706                         __m256i found = _mm256_or_si256(found1, found2);
707
708                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
709                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
710                 
711                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
712                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
713
714                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
715                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
716
717                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
718                         _mm256_storeu_si256(out2, hi);
719
720                         if (!_mm256_testz_si256(found, found)) {
721                                 break;
722                         }
723
724                         in += 2;
725                         ++out1;
726                         ++out2;
727                 }
728                 current_frame->len += (uint8_t *)in - aligned_start;
729         } else {
730                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
731                 while (in < (const __m256i *)limit) {
732                         __m256i data = _mm256_load_si256(in);
733                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
734                         __m256i found = _mm256_cmpeq_epi8(data, needle);
735                         if (!_mm256_testz_si256(found, found)) {
736                                 break;
737                         }
738
739                         ++in;
740                         ++out;
741                 }
742                 current_frame->len = (uint8_t *)out - current_frame->data;
743         }
744
745         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
746         return (const uint8_t *)in;
747 }
748
749 __attribute__((target("sse4.1")))
750 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
751 {
752         const __m128i needle = _mm_set1_epi8(sync_char);
753
754         const __m128i *in = (const __m128i *)aligned_start;
755         if (current_frame->interleaved) {
756                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
757                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
758                 if (current_frame->len % 2 == 1) {
759                         swap(out1, out2);
760                 }
761
762                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
763                 while (in < (const __m128i *)limit) {
764                         __m128i data1 = _mm_load_si128(in);
765                         __m128i data2 = _mm_load_si128(in + 1);
766                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
767                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
768                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
769                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
770                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
771                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
772                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
773                         _mm_storeu_si128(out2, hi);
774                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
775                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
776                         if (!_mm_testz_si128(found1, found1) ||
777                             !_mm_testz_si128(found2, found2)) {
778                                 break;
779                         }
780
781                         in += 2;
782                         ++out1;
783                         ++out2;
784                 }
785                 current_frame->len += (uint8_t *)in - aligned_start;
786         } else {
787                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
788                 while (in < (const __m128i *)limit) {
789                         __m128i data = _mm_load_si128(in);
790                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
791                         __m128i found = _mm_cmpeq_epi8(data, needle);
792                         if (!_mm_testz_si128(found, found)) {
793                                 break;
794                         }
795
796                         ++in;
797                         ++out;
798                 }
799                 current_frame->len = (uint8_t *)out - current_frame->data;
800         }
801
802         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
803         return (const uint8_t *)in;
804 }
805
806 #endif  // defined(HAS_MULTIVERSIONING)
807
808 void decode_packs(const libusb_transfer *xfr,
809                   const char *sync_pattern,
810                   int sync_length,
811                   FrameAllocator::Frame *current_frame,
812                   const char *frame_type_name,
813                   function<void(const uint8_t *start)> start_callback)
814 {
815         int offset = 0;
816         for (int i = 0; i < xfr->num_iso_packets; i++) {
817                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
818
819                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
820                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
821                         continue;
822 //exit(5);
823                 }
824
825                 const uint8_t *start = xfr->buffer + offset;
826                 const uint8_t *limit = start + pack->actual_length;
827                 while (start < limit) {  // Usually runs only one iteration.
828                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
829                         if (start == limit) break;
830                         assert(start < limit);
831
832                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
833                         if (start_next_frame == nullptr) {
834                                 // add the rest of the buffer
835                                 add_to_frame(current_frame, frame_type_name, start, limit);
836                                 break;
837                         } else {
838                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
839                                 start = start_next_frame + sync_length;  // skip sync
840                                 start_callback(start);
841                         }
842                 }
843 #if 0
844                 dump_pack(xfr, offset, pack);
845 #endif
846                 offset += pack->length;
847         }
848 }
849
850 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
851 {
852         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
853             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
854                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
855                 libusb_free_transfer(xfr);
856                 exit(3);
857         }
858
859         assert(xfr->user_data != nullptr);
860         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
861
862         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
863                 if (!usb->disconnected) {
864                         fprintf(stderr, "Device went away, stopping transfers.\n");
865                         usb->disconnected = true;
866                         if (usb->card_disconnected_callback) {
867                                 usb->card_disconnected_callback();
868                         }
869                 }
870                 // Don't reschedule the transfer; the loop will stop by itself.
871                 return;
872         }
873
874         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
875                 if (xfr->endpoint == 0x84) {
876                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
877                 } else {
878                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
879
880                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
881                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
882                 }
883         }
884         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
885                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
886                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
887 #if 0
888                 if (setup->wIndex == 44) {
889                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
890                 } else {
891                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
892                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
893                 }
894 #else
895                 memcpy(usb->register_file + usb->current_register, buf, 4);
896                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
897                 if (usb->current_register == 0) {
898                         // read through all of them
899                         printf("register dump:");
900                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
901                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
902                         }
903                         printf("\n");
904                 }
905                 libusb_fill_control_setup(xfr->buffer,
906                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
907                         /*index=*/usb->current_register, /*length=*/4);
908 #endif
909         }
910
911 #if 0
912         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
913         for (i = 0; i < xfr->actual_length; i++) {
914                 printf("%02x", xfr->buffer[i]);
915                 if (i % 16)
916                         printf("\n");
917                 else if (i % 8)
918                         printf("  ");
919                 else
920                         printf(" ");
921         }
922 #endif
923
924         int rc = libusb_submit_transfer(xfr);
925         if (rc < 0) {
926                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
927                 exit(1);
928         }
929 }
930
931 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
932 {
933         if (card_connected_callback != nullptr) {
934                 libusb_device_descriptor desc;
935                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
936                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
937                         libusb_unref_device(dev);
938                         return 1;
939                 }
940
941                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
942                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
943                         card_connected_callback(dev);  // Callback takes ownership.
944                         return 0;
945                 }
946         }
947         libusb_unref_device(dev);
948         return 0;
949 }
950
951 void BMUSBCapture::usb_thread_func()
952 {
953         sched_param param;
954         memset(&param, 0, sizeof(param));
955         param.sched_priority = 1;
956         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
957                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
958         }
959         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
960         while (!should_quit) {
961                 timeval sec { 1, 0 };
962                 int rc = libusb_handle_events_timeout(nullptr, &sec);
963                 if (rc != LIBUSB_SUCCESS)
964                         break;
965         }
966 }
967
968 namespace {
969
970 struct USBCardDevice {
971         uint16_t product;
972         uint8_t bus, port;
973         libusb_device *device;
974 };
975
// Maps a USB product ID to a human-readable product name. Only the two
// known IDs are accepted; anything else trips an assertion (and yields
// nullptr when assertions are compiled out).
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
987
988 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
989 {
990         const char *product_name = get_product_name(product);
991
992         char buf[256];
993         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
994                 id, bus, port, product_name);
995         return buf;
996 }
997
998 vector<USBCardDevice> find_all_cards()
999 {
1000         libusb_device **devices;
1001         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
1002         if (num_devices == -1) {
1003                 fprintf(stderr, "Error finding USB devices\n");
1004                 exit(1);
1005         }
1006         vector<USBCardDevice> found_cards;
1007         for (ssize_t i = 0; i < num_devices; ++i) {
1008                 libusb_device_descriptor desc;
1009                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1010                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1011                         exit(1);
1012                 }
1013
1014                 uint8_t bus = libusb_get_bus_number(devices[i]);
1015                 uint8_t port = libusb_get_port_number(devices[i]);
1016
1017                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1018                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1019                         libusb_unref_device(devices[i]);
1020                         continue;
1021                 }
1022
1023                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1024         }
1025         libusb_free_device_list(devices, 0);
1026
1027         // Sort the devices to get a consistent ordering.
1028         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1029                 if (a.product != b.product)
1030                         return a.product < b.product;
1031                 if (a.bus != b.bus)
1032                         return a.bus < b.bus;
1033                 return a.port < b.port;
1034         });
1035
1036         return found_cards;
1037 }
1038
1039 libusb_device_handle *open_card(int card_index, string *description)
1040 {
1041         vector<USBCardDevice> found_cards = find_all_cards();
1042
1043         for (size_t i = 0; i < found_cards.size(); ++i) {
1044                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1045                 fprintf(stderr, "%s\n", tmp_description.c_str());
1046                 if (i == size_t(card_index)) {
1047                         *description = tmp_description;
1048                 }
1049         }
1050
1051         if (size_t(card_index) >= found_cards.size()) {
1052                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1053                 exit(1);
1054         }
1055
1056         libusb_device_handle *devh;
1057         int rc = libusb_open(found_cards[card_index].device, &devh);
1058         if (rc < 0) {
1059                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1060                 exit(1);
1061         }
1062
1063         for (size_t i = 0; i < found_cards.size(); ++i) {
1064                 libusb_unref_device(found_cards[i].device);
1065         }
1066
1067         return devh;
1068 }
1069
1070 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1071 {
1072         uint8_t bus = libusb_get_bus_number(dev);
1073         uint8_t port = libusb_get_port_number(dev);
1074
1075         libusb_device_descriptor desc;
1076         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1077                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1078                 exit(1);
1079         }
1080
1081         *description = get_card_description(card_index, bus, port, desc.idProduct);
1082
1083         libusb_device_handle *devh;
1084         int rc = libusb_open(dev, &devh);
1085         if (rc < 0) {
1086                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1087                 exit(1);
1088         }
1089
1090         return devh;
1091 }
1092
1093 }  // namespace
1094
1095 unsigned BMUSBCapture::num_cards()
1096 {
1097         int rc = libusb_init(nullptr);
1098         if (rc < 0) {
1099                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1100                 exit(1);
1101         }
1102
1103         vector<USBCardDevice> found_cards = find_all_cards();
1104         unsigned ret = found_cards.size();
1105         for (size_t i = 0; i < found_cards.size(); ++i) {
1106                 libusb_unref_device(found_cards[i].device);
1107         }
1108         return ret;
1109 }
1110
1111 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1112 {
1113         current_pixel_format = pixel_format;
1114         update_capture_mode();
1115 }
1116
1117 void BMUSBCapture::configure_card()
1118 {
1119         if (video_frame_allocator == nullptr) {
1120                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1121                 set_video_frame_allocator(owned_video_frame_allocator.get());
1122         }
1123         if (audio_frame_allocator == nullptr) {
1124                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1125                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1126         }
1127         dequeue_thread_should_quit = false;
1128         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1129
1130         int rc;
1131         struct libusb_transfer *xfr;
1132
1133         rc = libusb_init(nullptr);
1134         if (rc < 0) {
1135                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1136                 exit(1);
1137         }
1138
1139         if (dev == nullptr) {
1140                 devh = open_card(card_index, &description);
1141         } else {
1142                 devh = open_card(card_index, dev, &description);
1143                 libusb_unref_device(dev);
1144         }
1145         if (!devh) {
1146                 fprintf(stderr, "Error finding USB device\n");
1147                 exit(1);
1148         }
1149
1150         libusb_config_descriptor *config;
1151         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1152         if (rc < 0) {
1153                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1154                 exit(1);
1155         }
1156
1157 #if 0
1158         printf("%d interface\n", config->bNumInterfaces);
1159         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1160                 printf("  interface %d\n", interface_number);
1161                 const libusb_interface *interface = &config->interface[interface_number];
1162                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1163                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1164                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1165                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1166                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1167                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1168                         }
1169                 }
1170         }
1171 #endif
1172
1173         rc = libusb_set_configuration(devh, /*configuration=*/1);
1174         if (rc < 0) {
1175                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1176                 exit(1);
1177         }
1178
1179         rc = libusb_claim_interface(devh, 0);
1180         if (rc < 0) {
1181                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1182                 exit(1);
1183         }
1184
1185         // Alternate setting 1 is output, alternate setting 2 is input.
1186         // Card is reset when switching alternates, so the driver uses
1187         // this “double switch” when it wants to reset.
1188         //
1189         // There's also alternate settings 3 and 4, which seem to be
1190         // like 1 and 2 except they advertise less bandwidth needed.
1191         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1192         if (rc < 0) {
1193                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1194                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1195                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1196                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1197                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1198                 }
1199                 exit(1);
1200         }
1201         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1202         if (rc < 0) {
1203                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1204                 exit(1);
1205         }
1206 #if 0
1207         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1208         if (rc < 0) {
1209                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1210                 exit(1);
1211         }
1212 #endif
1213
1214 #if 0
1215         rc = libusb_claim_interface(devh, 3);
1216         if (rc < 0) {
1217                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1218                 exit(1);
1219         }
1220 #endif
1221
1222         // theories:
1223         //   44 is some kind of timer register (first 16 bits count upwards)
1224         //   24 is some sort of watchdog?
1225         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1226         //      (or will go to 0x73c60010?), also seen 0x73c60100
1227         //   12 also changes all the time, unclear why  
1228         //   16 seems to be autodetected mode somehow
1229         //      --    this is e00115e0 after reset?
1230         //                    ed0115e0 after mode change [to output?]
1231         //                    2d0015e0 after more mode change [to input]
1232         //                    ed0115e0 after more mode change
1233         //                    2d0015e0 after more mode change
1234         //
1235         //                    390115e0 seems to indicate we have signal
1236         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1237         //
1238         //                    200015e0 on startup
1239         //         changes to 250115e0 when we sync to the signal
1240         //
1241         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1242         //
1243         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
1244         //
1245         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1246         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1247         //
1248         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1249         //    perhaps some of them are related to analog output?
1250         //
1251         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1252         //    but the driver sets it to 0x8036802a at some point.
1253         //
1254         //    all of this is on request 214/215. other requests (192, 219,
1255         //    222, 223, 224) are used for firmware upgrade. Probably best to
1256         //    stay out of it unless you know what you're doing.
1257         //
1258         //
1259         // register 16:
1260         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1261         //
1262         // theories:
1263         //   0x01 - stable signal
1264         //   0x04 - deep color
1265         //   0x08 - unknown (audio??)
1266         //   0x20 - 720p??
1267         //   0x30 - 576p??
1268
1269         update_capture_mode();
1270
1271         struct ctrl {
1272                 int endpoint;
1273                 int request;
1274                 int index;
1275                 uint32_t data;
1276         };
1277         static const ctrl ctrls[] = {
1278                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1279                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1280
1281                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1282                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1283                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1284                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1285         };
1286
1287         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1288                 uint32_t flipped = htonl(ctrls[req].data);
1289                 static uint8_t value[4];
1290                 memcpy(value, &flipped, sizeof(flipped));
1291                 int size = sizeof(value);
1292                 //if (ctrls[req].request == 215) size = 0;
1293                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1294                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1295                 if (rc < 0) {
1296                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1297                         exit(1);
1298                 }
1299
1300                 if (ctrls[req].index == 16 && rc == 4) {
1301                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1302                 }
1303
1304 #if 0
1305                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1306                 for (int i = 0; i < rc; ++i) {
1307                         printf("%02x", value[i]);
1308                 }
1309                 printf("\n");
1310 #endif
1311         }
1312
1313 #if 0
1314         // DEBUG
1315         for ( ;; ) {
1316                 static int my_index = 0;
1317                 static uint8_t value[4];
1318                 int size = sizeof(value);
1319                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1320                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1321                 if (rc < 0) {
1322                         fprintf(stderr, "Error on control\n");
1323                         exit(1);
1324                 }
1325                 printf("rc=%d index=%d: 0x", rc, my_index);
1326                 for (int i = 0; i < rc; ++i) {
1327                         printf("%02x", value[i]);
1328                 }
1329                 printf("\n");
1330         }
1331 #endif
1332
1333 #if 0
1334         // set up an asynchronous transfer of the timer register
1335         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1336         static int completed = 0;
1337
1338         xfr = libusb_alloc_transfer(0);
1339         libusb_fill_control_setup(cmdbuf,
1340             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1341                 /*index=*/44, /*length=*/4);
1342         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1343         xfr->user_data = this;
1344         libusb_submit_transfer(xfr);
1345
1346         // set up an asynchronous transfer of register 24
1347         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1348         static int completed2 = 0;
1349
1350         xfr = libusb_alloc_transfer(0);
1351         libusb_fill_control_setup(cmdbuf2,
1352             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1353                 /*index=*/24, /*length=*/4);
1354         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1355         xfr->user_data = this;
1356         libusb_submit_transfer(xfr);
1357 #endif
1358
1359         // set up an asynchronous transfer of the register dump
1360         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1361         static int completed3 = 0;
1362
1363         xfr = libusb_alloc_transfer(0);
1364         libusb_fill_control_setup(cmdbuf3,
1365             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1366                 /*index=*/current_register, /*length=*/4);
1367         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1368         xfr->user_data = this;
1369         //libusb_submit_transfer(xfr);
1370
1371         //audiofp = fopen("audio.raw", "wb");
1372
1373         // set up isochronous transfers for audio and video
1374         for (int e = 3; e <= 4; ++e) {
1375                 int num_transfers = 6;
1376                 for (int i = 0; i < num_transfers; ++i) {
1377                         size_t buf_size;
1378                         int num_iso_pack, size;
1379                         if (e == 3) {
1380                                 // Allocate for the minimum width (because that will give us the
1381                                 // largest number of packets, so we don't need to reallocate), but
1382                                 // we'll default to 720p for the first frame.
1383                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1384                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1385                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1386                         } else {
1387                                 size = 0xc0;
1388                                 num_iso_pack = 80;
1389                                 buf_size = num_iso_pack * size;
1390                         }
1391                         int num_bytes = num_iso_pack * size;
1392                         assert(size_t(num_bytes) <= buf_size);
1393 #if LIBUSB_API_VERSION >= 0x01000105
1394                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1395 #else
1396                         uint8_t *buf = nullptr;
1397 #endif
1398                         if (buf == nullptr) {
1399                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1400 #if LIBUSB_API_VERSION >= 0x01000105
1401                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1402 #else
1403                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1404 #endif
1405                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1406                                 buf = new uint8_t[num_bytes];
1407                         }
1408
1409                         xfr = libusb_alloc_transfer(num_iso_pack);
1410                         if (!xfr) {
1411                                 fprintf(stderr, "oom\n");
1412                                 exit(1);
1413                         }
1414
1415                         int ep = LIBUSB_ENDPOINT_IN | e;
1416                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1417                                 num_iso_pack, cb_xfr, nullptr, 0);
1418                         libusb_set_iso_packet_lengths(xfr, size);
1419                         xfr->user_data = this;
1420
1421                         if (e == 3) {
1422                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1423                         }
1424
1425                         iso_xfrs.push_back(xfr);
1426                 }
1427         }
1428 }
1429
1430 void BMUSBCapture::start_bm_capture()
1431 {
1432         int i = 0;
1433         for (libusb_transfer *xfr : iso_xfrs) {
1434                 int rc = libusb_submit_transfer(xfr);
1435                 ++i;
1436                 if (rc < 0) {
1437                         //printf("num_bytes=%d\n", num_bytes);
1438                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1439                                 xfr->endpoint, i, libusb_error_name(rc));
1440                         exit(1);
1441                 }
1442         }
1443
1444
1445 #if 0
1446         libusb_release_interface(devh, 0);
1447 out:
1448         if (devh)
1449                 libusb_close(devh);
1450         libusb_exit(nullptr);
1451         return rc;
1452 #endif
1453 }
1454
1455 void BMUSBCapture::stop_dequeue_thread()
1456 {
1457         dequeue_thread_should_quit = true;
1458         queues_not_empty.notify_all();
1459         dequeue_thread.join();
1460 }
1461
1462 void BMUSBCapture::start_bm_thread()
1463 {
1464         // Devices leaving are discovered by seeing the isochronous packets
1465         // coming back with errors, so only care about devices joining.
1466         if (card_connected_callback != nullptr) {
1467                 if (libusb_hotplug_register_callback(
1468                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1469                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1470                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1471                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1472                         exit(1);
1473                 }
1474         }
1475
1476         should_quit = false;
1477         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1478 }
1479
1480 void BMUSBCapture::stop_bm_thread()
1481 {
1482         should_quit = true;
1483         libusb_interrupt_event_handler(nullptr);
1484         usb_thread.join();
1485 }
1486
1487 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1488 {
1489         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1490         VideoMode auto_mode;
1491         auto_mode.name = "Autodetect";
1492         auto_mode.autodetect = true;
1493         return {{ 0, auto_mode }};
1494 }
1495
1496 uint32_t BMUSBCapture::get_current_video_mode() const
1497 {
1498         return 0;  // Matches get_available_video_modes().
1499 }
1500
1501 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1502 {
1503         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1504 }
1505
1506 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1507 {
1508         return {
1509                 { 0x00000000, "HDMI/SDI" },
1510                 { 0x02000000, "Component" },
1511                 { 0x04000000, "Composite" },
1512                 { 0x06000000, "S-video" }
1513         };
1514 }
1515
1516 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1517 {
1518         assert((video_input_id & ~0x06000000) == 0);
1519         current_video_input = video_input_id;
1520         update_capture_mode();
1521 }
1522
1523 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1524 {
1525         return {
1526                 { 0x00000000, "Embedded" },
1527                 { 0x10000000, "Analog" }
1528         };
1529 }
1530
1531 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1532 {
1533         assert((audio_input_id & ~0x10000000) == 0);
1534         current_audio_input = audio_input_id;
1535         update_capture_mode();
1536 }
1537
1538 void BMUSBCapture::update_capture_mode()
1539 {
1540         if (devh == nullptr) {
1541                 return;
1542         }
1543
1544         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1545         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1546         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1547                 mode |= htonl(0x20000000);
1548         } else {
1549                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1550         }
1551
1552         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1553                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1554         if (rc < 0) {
1555                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1556                 exit(1);
1557         }
1558 }
1559
1560 }  // namespace bmusb