]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Add an RGBA pixel type.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.6.0
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
61 FILE *audiofp;
62
63 thread usb_thread;
64 atomic<bool> should_quit;
65
// Returns the number of bytes in one line of the given pixel width when
// packed as v210: every started group of six pixels takes four 32-bit words.
int v210_stride(int width)
{
        const int six_pixel_groups = (width + 5) / 6;
        return six_pixel_groups * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row of the lookup table used by decode_video_format(). The member order
// matters, since the table is filled in with positional aggregate initializers.
struct VideoFormatEntry {
        // Format word after the flag bits have been masked away.
        uint16_t normalized_video_format;

        // Picture dimensions, and the value copied to second_field_start
        // (zero for all non-interlaced table rows).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Values copied to extra_lines_top / extra_lines_bottom.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction (nom / den).
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
203                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
204                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
205                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
206                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
207                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
208                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
209                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
210                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
211                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
212                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
213                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
214                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
215         };
216         for (const VideoFormatEntry &entry : entries) {
217                 if (normalized_video_format == entry.normalized_video_format) {
218                         decoded_video_format->width = entry.width;
219                         decoded_video_format->height = entry.height;
220                         if (video_format & 0x0800) {
221                                 decoded_video_format->stride = entry.width * 2;
222                         } else {
223                                 decoded_video_format->stride = v210_stride(entry.width);
224                         }
225                         decoded_video_format->second_field_start = entry.second_field_start;
226                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
227                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
228                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
229                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
230                         decoded_video_format->interlaced = entry.interlaced;
231                         return true;
232                 }
233         }
234
235         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
236         decoded_video_format->width = 1280;
237         decoded_video_format->height = 720;
238         decoded_video_format->stride = 1280 * 2;
239         decoded_video_format->frame_rate_nom = 60;
240         decoded_video_format->frame_rate_den = 1;
241         return false;
242 }
243
244 }  // namespace
245
246 FrameAllocator::~FrameAllocator() {}
247
248 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
249         : frame_size(frame_size)
250 {
251         for (size_t i = 0; i < num_queued_frames; ++i) {
252                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
253         }
254 }
255
256 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
257 {
258         Frame vf;
259         vf.owner = this;
260
261         unique_lock<mutex> lock(freelist_mutex);  // Meh.
262         if (freelist.empty()) {
263                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
264                         frame_size);
265         } else {
266                 vf.data = freelist.top().release();
267                 vf.size = frame_size;
268                 freelist.pop();  // Meh.
269         }
270         return vf;
271 }
272
273 void MallocFrameAllocator::release_frame(Frame frame)
274 {
275         if (frame.overflow > 0) {
276                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
277         }
278         unique_lock<mutex> lock(freelist_mutex);
279         freelist.push(unique_ptr<uint8_t[]>(frame.data));
280 }
281
// Compares two 16-bit counters that are allowed to wrap around.
// Returns true if b is ahead of a by at least 1 and less than 0x8000 steps
// (counting modulo 0x10000); equal values are never "less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
293
294 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
295 {
296         unique_lock<mutex> lock(queue_lock);
297         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
298                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
299                         q->back().timecode, timecode);
300                 frame.owner->release_frame(frame);
301                 return;
302         }
303
304         QueuedFrame qf;
305         qf.format = format;
306         qf.timecode = timecode;
307         qf.frame = frame;
308         q->push_back(move(qf));
309         queues_not_empty.notify_one();  // might be spurious
310 }
311
312 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
313 {
314         FILE *fp = fopen(filename, "wb");
315         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
316                 printf("short write!\n");
317         }
318         fclose(fp);
319 }
320
321 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
322 {
323         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
324 }
325
326 void BMUSBCapture::dequeue_thread_func()
327 {
328         char thread_name[16];
329         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
330         pthread_setname_np(pthread_self(), thread_name);
331
332         if (has_dequeue_callbacks) {
333                 dequeue_init_callback();
334         }
335         while (!dequeue_thread_should_quit) {
336                 unique_lock<mutex> lock(queue_lock);
337                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
338
339                 if (dequeue_thread_should_quit) break;
340
341                 uint16_t video_timecode = pending_video_frames.front().timecode;
342                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
343                 AudioFormat audio_format;
344                 audio_format.bits_per_sample = 24;
345                 audio_format.num_channels = 8;
346                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
347                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
348                                 video_timecode);
349                         QueuedFrame video_frame = pending_video_frames.front();
350                         pending_video_frames.pop_front();
351                         lock.unlock();
352                         video_frame_allocator->release_frame(video_frame.frame);
353                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
354                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
355                                 audio_timecode);
356                         QueuedFrame audio_frame = pending_audio_frames.front();
357                         pending_audio_frames.pop_front();
358                         lock.unlock();
359                         audio_format.id = audio_frame.format;
360
361                         // Use the video format of the pending frame.
362                         QueuedFrame video_frame = pending_video_frames.front();
363                         VideoFormat video_format;
364                         decode_video_format(video_frame.format, &video_format);
365
366                         frame_callback(audio_timecode,
367                                        FrameAllocator::Frame(), 0, video_format,
368                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
369                 } else {
370                         QueuedFrame video_frame = pending_video_frames.front();
371                         QueuedFrame audio_frame = pending_audio_frames.front();
372                         pending_audio_frames.pop_front();
373                         pending_video_frames.pop_front();
374                         lock.unlock();
375
376 #if 0
377                         char filename[255];
378                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
379                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
380                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
381 #endif
382
383                         VideoFormat video_format;
384                         audio_format.id = audio_frame.format;
385                         if (decode_video_format(video_frame.format, &video_format)) {
386                                 frame_callback(video_timecode,
387                                                video_frame.frame, HEADER_SIZE, video_format,
388                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
389                         } else {
390                                 frame_callback(video_timecode,
391                                                FrameAllocator::Frame(), 0, video_format,
392                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
393                         }
394                 }
395         }
396         if (has_dequeue_callbacks) {
397                 dequeue_cleanup_callback();
398         }
399 }
400
401 void BMUSBCapture::start_new_frame(const uint8_t *start)
402 {
403         uint16_t format = (start[3] << 8) | start[2];
404         uint16_t timecode = (start[1] << 8) | start[0];
405
406         if (current_video_frame.len > 0) {
407                 current_video_frame.received_timestamp = steady_clock::now();
408
409                 // If format is 0x0800 (no signal), add a fake (empty) audio
410                 // frame to get it out of the queue.
411                 // TODO: Figure out if there are other formats that come with
412                 // no audio, and treat them the same.
413                 if (format == 0x0800) {
414                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
415                         if (fake_audio_frame.data == nullptr) {
416                                 // Oh well, it's just a no-signal frame anyway.
417                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
418                                 current_video_frame.owner->release_frame(current_video_frame);
419                                 current_video_frame = video_frame_allocator->alloc_frame();
420                                 return;
421                         }
422                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
423                 }
424                 //dump_frame();
425                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
426
427                 // Update the assumed frame width. We might be one frame too late on format changes,
428                 // but it's much better than asking the user to choose manually.
429                 VideoFormat video_format;
430                 if (decode_video_format(format, &video_format)) {
431                         assumed_frame_width = video_format.width;
432                 }
433         }
434         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
435         //      format, timecode,
436         //      //start[7], start[6], start[5], start[4],
437         //      read_current_frame, FRAME_SIZE);
438
439         current_video_frame = video_frame_allocator->alloc_frame();
440         //if (current_video_frame.data == nullptr) {
441         //      read_current_frame = -1;
442         //} else {
443         //      read_current_frame = 0;
444         //}
445 }
446
447 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
448 {
449         uint16_t format = (start[3] << 8) | start[2];
450         uint16_t timecode = (start[1] << 8) | start[0];
451         if (current_audio_frame.len > 0) {
452                 current_audio_frame.received_timestamp = steady_clock::now();
453                 //dump_audio_block();
454                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
455         }
456         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
457         //      format, timecode, read_current_audio_block);
458         current_audio_frame = audio_frame_allocator->alloc_frame();
459 }
460
#if 0
// Debugging aid (compiled out): hex-dumps the received bytes of one
// isochronous packet, which start at the given offset into the transfer
// buffer. Sixteen bytes are printed per line, with a wider gap after the
// eighth byte.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        for (unsigned j = 0; j < pack->actual_length; j++) {
                const char *separator = " ";
                if ((j % 16) == 15) {
                        separator = "\n";
                } else if ((j % 8) == 7) {
                        separator = "  ";
                }
                printf("%02x%s", xfr->buffer[j + offset], separator);
        }
}
#endif
477
// De-interleaves n bytes from src: bytes at even offsets go to dest1 and
// bytes at odd offsets go to dest2, so each destination receives n / 2 bytes.
// n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
489
490 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
491 {
492         if (current_frame->data == nullptr ||
493             current_frame->len > current_frame->size ||
494             start == end) {
495                 return;
496         }
497
498         int bytes = end - start;
499         if (current_frame->len + bytes > current_frame->size) {
500                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
501                 current_frame->len = current_frame->size;
502                 if (current_frame->overflow > 1048576) {
503                         printf("%d bytes overflow after last %s frame\n",
504                                 int(current_frame->overflow), frame_type_name);
505                         current_frame->overflow = 0;
506                 }
507                 //dump_frame();
508         } else {
509                 if (current_frame->interleaved) {
510                         uint8_t *data = current_frame->data + current_frame->len / 2;
511                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
512                         if (current_frame->len % 2 == 1) {
513                                 ++data;
514                                 swap(data, data2);
515                         }
516                         if (bytes % 2 == 1) {
517                                 *data++ = *start++;
518                                 swap(data, data2);
519                                 ++current_frame->len;
520                                 --bytes;
521                         }
522                         memcpy_interleaved(data, data2, start, bytes);
523                         current_frame->len += bytes;
524                 } else {
525                         memcpy(current_frame->data + current_frame->len, start, bytes);
526                         current_frame->len += bytes;
527                 }
528         }
529 }
530
#if 0
// Debugging aid (compiled out): prints the 32 bytes of an AVX2 register in
// hex, lowest byte first, in four groups of eight, prefixed by a label.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");  // Extra gap between groups of eight.
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
573
574 #ifndef HAS_MULTIVERSIONING
575
576 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
577 {
578         // No fast path possible unless we have multiversioning.
579         return start;
580 }
581
582 #else  // defined(HAS_MULTIVERSIONING)
583
584 __attribute__((target("sse4.1")))
585 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
586
587 __attribute__((target("avx2")))
588 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
589
590 // Does a memcpy and memchr in one to reduce processing time.
591 // Note that the benefit is somewhat limited if your L3 cache is small,
592 // as you'll (unfortunately) spend most of the time loading the data
593 // from main memory.
594 //
595 // Complicated cases are left to the slow path; it basically stops copying
596 // up until the first instance of "sync_char" (usually a bit before, actually).
597 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
598 // data, and what we really need this for is the 00 00 ff ff marker in video data.
599 __attribute__((target("default")))
600 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
601 {
602         // No fast path possible unless we have SSE 4.1 or higher.
603         return start;
604 }
605
606 __attribute__((target("sse4.1", "avx2")))
607 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
608 {
609         if (current_frame->data == nullptr ||
610             current_frame->len > current_frame->size ||
611             start == limit) {
612                 return start;
613         }
614         size_t orig_bytes = limit - start;
615         if (orig_bytes < 128) {
616                 // Don't bother.
617                 return start;
618         }
619
620         // Don't read more bytes than we can write.
621         limit = min(limit, start + (current_frame->size - current_frame->len));
622
623         // Align end to 32 bytes.
624         limit = (const uint8_t *)(intptr_t(limit) & ~31);
625
626         if (start >= limit) {
627                 return start;
628         }
629
630         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
631         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
632         if (aligned_start != start) {
633                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
634                 if (sync_start == nullptr) {
635                         add_to_frame(current_frame, "", start, aligned_start);
636                 } else {
637                         add_to_frame(current_frame, "", start, sync_start);
638                         return sync_start;
639                 }
640         }
641
642         // Make the length a multiple of 64.
643         if (current_frame->interleaved) {
644                 if (((limit - aligned_start) % 64) != 0) {
645                         limit -= 32;
646                 }
647                 assert(((limit - aligned_start) % 64) == 0);
648         }
649
650         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
651 }
652
// AVX2 variant of the fast-path copy. Consumes the input in whole 32-byte
// vectors (two vectors, i.e. 64 bytes, per iteration when the frame is
// interleaved), stores them into current_frame and stops at the first block
// that contains sync_char anywhere in it.
//
// The input is read with aligned / streaming vector loads, so aligned_start
// must be 32-byte aligned. The interleaved loop always reads `in` and `in + 1`,
// so the range has to be a multiple of 64 bytes there.
// NOTE(review): presumably the caller guarantees both -- confirm.
//
// Every block is stored *before* it is checked for the sync byte, so one block
// beyond the recorded length may be written to the destination; this assumes
// the frame buffers have that much slack -- TODO confirm against the caller.
//
// Returns a pointer to the first input byte that was not consumed: either
// `limit`, or the start of the block in which sync_char was seen.
// current_frame->len is advanced by the consumed amount only.
653 __attribute__((target("avx2")))
654 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
655 {
656         const __m256i needle = _mm256_set1_epi8(sync_char);
657
658         const __restrict __m256i *in = (const __m256i *)aligned_start;
659         if (current_frame->interleaved) {
            // Even-offset input bytes end up in out1, odd-offset bytes in out2.
            // `len` bytes have already been split between data and data2; if
            // len is odd, the next byte belongs to the other plane, hence the swap.
660                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
661                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
662                 if (current_frame->len % 2 == 1) {
663                         swap(out1, out2);
664                 }
665
            // Per 128-bit lane: gather the even-indexed bytes into the low
            // 8 bytes and the odd-indexed bytes into the high 8 bytes.
666                 __m256i shuffle_cw = _mm256_set_epi8(
667                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
668                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
669                 while (in < (const __m256i *)limit) {
670                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
671                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
672                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
673
                    // Byte-wise match against the sync byte over all 64 input bytes.
674                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
675                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
676                         __m256i found = _mm256_or_si256(found1, found2);
677
678                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
679                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
680                 
                    // Swap the two middle 64-bit quarters so that each 128-bit
                    // half holds only even-offset or only odd-offset bytes.
681                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
682                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
683
                    // lo = low halves of data1|data2 (all even-offset bytes),
                    // hi = high halves of data1|data2 (all odd-offset bytes).
684                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
685                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
686
687                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
688                         _mm256_storeu_si256(out2, hi);
689
                    // `in` is not advanced on a match, so the returned pointer is
                    // the start of the 64-byte block containing the sync byte.
690                         if (!_mm256_testz_si256(found, found)) {
691                                 break;
692                         }
693
694                         in += 2;
695                         ++out1;
696                         ++out2;
697                 }
698                 current_frame->len += (uint8_t *)in - aligned_start;
699         } else {
            // Non-interleaved: straight copy, one 32-byte vector at a time.
700                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
701                 while (in < (const __m256i *)limit) {
702                         __m256i data = _mm256_load_si256(in);
703                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
704                         __m256i found = _mm256_cmpeq_epi8(data, needle);
705                         if (!_mm256_testz_si256(found, found)) {
706                                 break;
707                         }
708
709                         ++in;
710                         ++out;
711                 }
712                 current_frame->len = (uint8_t *)out - current_frame->data;
713         }
714
715         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
716         return (const uint8_t *)in;
717 }
718
// SSE4.1 variant of the fast-path copy. Same contract as the AVX2 version but
// working on 16-byte vectors (two vectors, i.e. 32 bytes, per iteration when
// the frame is interleaved).
//
// The input is read with aligned loads, so aligned_start must be 16-byte
// aligned, and the interleaved loop always reads `in` and `in + 1`.
// NOTE(review): presumably the caller guarantees alignment and a suitably
// rounded range -- confirm.
//
// Every block is stored before it is checked for sync_char, so one block
// beyond the recorded length may be written to the destination.
//
// Returns a pointer to the first input byte that was not consumed: either
// `limit`, or the start of the block in which sync_char was seen.
719 __attribute__((target("sse4.1")))
720 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
721 {
722         const __m128i needle = _mm_set1_epi8(sync_char);
723
724         const __m128i *in = (const __m128i *)aligned_start;
725         if (current_frame->interleaved) {
            // Even-offset input bytes end up in out1, odd-offset bytes in out2.
            // If an odd number of bytes has been stored so far, the next byte
            // belongs to the other plane, hence the swap.
726                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
727                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
728                 if (current_frame->len % 2 == 1) {
729                         swap(out1, out2);
730                 }
731
732                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
733                 while (in < (const __m128i *)limit) {
734                         __m128i data1 = _mm_load_si128(in);
735                         __m128i data2 = _mm_load_si128(in + 1);
                    // Treat the input as 16-bit words: the low byte of each word
                    // is the even-offset byte, the high byte the odd-offset one.
736                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
737                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
738                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
739                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
                    // All words are <= 0xff at this point, so the saturating pack
                    // simply narrows them back to bytes.
740                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
741                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
742                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
743                         _mm_storeu_si128(out2, hi);
744                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
745                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                    // `in` is not advanced on a match, so the returned pointer is
                    // the start of the 32-byte block containing the sync byte.
746                         if (!_mm_testz_si128(found1, found1) ||
747                             !_mm_testz_si128(found2, found2)) {
748                                 break;
749                         }
750
751                         in += 2;
752                         ++out1;
753                         ++out2;
754                 }
755                 current_frame->len += (uint8_t *)in - aligned_start;
756         } else {
            // Non-interleaved: straight copy, one 16-byte vector at a time.
757                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
758                 while (in < (const __m128i *)limit) {
759                         __m128i data = _mm_load_si128(in);
760                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
761                         __m128i found = _mm_cmpeq_epi8(data, needle);
762                         if (!_mm_testz_si128(found, found)) {
763                                 break;
764                         }
765
766                         ++in;
767                         ++out;
768                 }
769                 current_frame->len = (uint8_t *)out - current_frame->data;
770         }
771
772         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
773         return (const uint8_t *)in;
774 }
775
776 #endif  // defined(HAS_MULTIVERSIONING)
777
778 void decode_packs(const libusb_transfer *xfr,
779                   const char *sync_pattern,
780                   int sync_length,
781                   FrameAllocator::Frame *current_frame,
782                   const char *frame_type_name,
783                   function<void(const uint8_t *start)> start_callback)
784 {
785         int offset = 0;
786         for (int i = 0; i < xfr->num_iso_packets; i++) {
787                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
788
789                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
790                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
791                         continue;
792 //exit(5);
793                 }
794
795                 const uint8_t *start = xfr->buffer + offset;
796                 const uint8_t *limit = start + pack->actual_length;
797                 while (start < limit) {  // Usually runs only one iteration.
798                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
799                         if (start == limit) break;
800                         assert(start < limit);
801
802                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
803                         if (start_next_frame == nullptr) {
804                                 // add the rest of the buffer
805                                 add_to_frame(current_frame, frame_type_name, start, limit);
806                                 break;
807                         } else {
808                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
809                                 start = start_next_frame + sync_length;  // skip sync
810                                 start_callback(start);
811                         }
812                 }
813 #if 0
814                 dump_pack(xfr, offset, pack);
815 #endif
816                 offset += pack->length;
817         }
818 }
819
820 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
821 {
822         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
823             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
824                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
825                 libusb_free_transfer(xfr);
826                 exit(3);
827         }
828
829         assert(xfr->user_data != nullptr);
830         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
831
832         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
833                 if (!usb->disconnected) {
834                         fprintf(stderr, "Device went away, stopping transfers.\n");
835                         usb->disconnected = true;
836                         if (usb->card_disconnected_callback) {
837                                 usb->card_disconnected_callback();
838                         }
839                 }
840                 // Don't reschedule the transfer; the loop will stop by itself.
841                 return;
842         }
843
844         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
845                 if (xfr->endpoint == 0x84) {
846                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
847                 } else {
848                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
849
850                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
851                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
852                 }
853         }
854         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
855                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
856                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
857 #if 0
858                 if (setup->wIndex == 44) {
859                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
860                 } else {
861                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
862                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
863                 }
864 #else
865                 memcpy(usb->register_file + usb->current_register, buf, 4);
866                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
867                 if (usb->current_register == 0) {
868                         // read through all of them
869                         printf("register dump:");
870                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
871                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
872                         }
873                         printf("\n");
874                 }
875                 libusb_fill_control_setup(xfr->buffer,
876                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
877                         /*index=*/usb->current_register, /*length=*/4);
878 #endif
879         }
880
881 #if 0
882         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
883         for (i = 0; i < xfr->actual_length; i++) {
884                 printf("%02x", xfr->buffer[i]);
885                 if (i % 16)
886                         printf("\n");
887                 else if (i % 8)
888                         printf("  ");
889                 else
890                         printf(" ");
891         }
892 #endif
893
894         int rc = libusb_submit_transfer(xfr);
895         if (rc < 0) {
896                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
897                 exit(1);
898         }
899 }
900
901 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
902 {
903         if (card_connected_callback != nullptr) {
904                 libusb_device_descriptor desc;
905                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
906                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
907                         libusb_unref_device(dev);
908                         return 1;
909                 }
910
911                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
912                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
913                         card_connected_callback(dev);  // Callback takes ownership.
914                         return 0;
915                 }
916         }
917         libusb_unref_device(dev);
918         return 0;
919 }
920
921 void BMUSBCapture::usb_thread_func()
922 {
923         sched_param param;
924         memset(&param, 0, sizeof(param));
925         param.sched_priority = 1;
926         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
927                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
928         }
929         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
930         while (!should_quit) {
931                 timeval sec { 1, 0 };
932                 int rc = libusb_handle_events_timeout(nullptr, &sec);
933                 if (rc != LIBUSB_SUCCESS)
934                         break;
935         }
936 }
937
938 namespace {
939
940 struct USBCardDevice {
941         uint16_t product;
942         uint8_t bus, port;
943         libusb_device *device;
944 };
945
// Maps a Blackmagic USB product id to a human-readable product name.
//
// Only the two supported products are expected here; any other id trips the
// assert in debug builds. When asserts are compiled out, a placeholder string
// is returned rather than nullptr, so the result is always safe to pass to a
// "%s" format.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return "Unknown Blackmagic device";
        }
}
957
958 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
959 {
960         const char *product_name = get_product_name(product);
961
962         char buf[256];
963         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
964                 id, bus, port, product_name);
965         return buf;
966 }
967
968 vector<USBCardDevice> find_all_cards()
969 {
970         libusb_device **devices;
971         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
972         if (num_devices == -1) {
973                 fprintf(stderr, "Error finding USB devices\n");
974                 exit(1);
975         }
976         vector<USBCardDevice> found_cards;
977         for (ssize_t i = 0; i < num_devices; ++i) {
978                 libusb_device_descriptor desc;
979                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
980                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
981                         exit(1);
982                 }
983
984                 uint8_t bus = libusb_get_bus_number(devices[i]);
985                 uint8_t port = libusb_get_port_number(devices[i]);
986
987                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
988                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
989                         libusb_unref_device(devices[i]);
990                         continue;
991                 }
992
993                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
994         }
995         libusb_free_device_list(devices, 0);
996
997         // Sort the devices to get a consistent ordering.
998         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
999                 if (a.product != b.product)
1000                         return a.product < b.product;
1001                 if (a.bus != b.bus)
1002                         return a.bus < b.bus;
1003                 return a.port < b.port;
1004         });
1005
1006         return found_cards;
1007 }
1008
1009 libusb_device_handle *open_card(int card_index, string *description)
1010 {
1011         vector<USBCardDevice> found_cards = find_all_cards();
1012
1013         for (size_t i = 0; i < found_cards.size(); ++i) {
1014                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1015                 fprintf(stderr, "%s\n", tmp_description.c_str());
1016                 if (i == size_t(card_index)) {
1017                         *description = tmp_description;
1018                 }
1019         }
1020
1021         if (size_t(card_index) >= found_cards.size()) {
1022                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1023                 exit(1);
1024         }
1025
1026         libusb_device_handle *devh;
1027         int rc = libusb_open(found_cards[card_index].device, &devh);
1028         if (rc < 0) {
1029                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1030                 exit(1);
1031         }
1032
1033         for (size_t i = 0; i < found_cards.size(); ++i) {
1034                 libusb_unref_device(found_cards[i].device);
1035         }
1036
1037         return devh;
1038 }
1039
1040 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1041 {
1042         uint8_t bus = libusb_get_bus_number(dev);
1043         uint8_t port = libusb_get_port_number(dev);
1044
1045         libusb_device_descriptor desc;
1046         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1047                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1048                 exit(1);
1049         }
1050
1051         *description = get_card_description(card_index, bus, port, desc.idProduct);
1052
1053         libusb_device_handle *devh;
1054         int rc = libusb_open(dev, &devh);
1055         if (rc < 0) {
1056                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1057                 exit(1);
1058         }
1059
1060         return devh;
1061 }
1062
1063 }  // namespace
1064
1065 unsigned BMUSBCapture::num_cards()
1066 {
1067         int rc = libusb_init(nullptr);
1068         if (rc < 0) {
1069                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1070                 exit(1);
1071         }
1072
1073         vector<USBCardDevice> found_cards = find_all_cards();
1074         unsigned ret = found_cards.size();
1075         for (size_t i = 0; i < found_cards.size(); ++i) {
1076                 libusb_unref_device(found_cards[i].device);
1077         }
1078         return ret;
1079 }
1080
1081 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1082 {
1083         current_pixel_format = pixel_format;
1084         update_capture_mode();
1085 }
1086
1087 void BMUSBCapture::configure_card()
1088 {
1089         if (video_frame_allocator == nullptr) {
1090                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1091                 set_video_frame_allocator(owned_video_frame_allocator.get());
1092         }
1093         if (audio_frame_allocator == nullptr) {
1094                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1095                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1096         }
1097         dequeue_thread_should_quit = false;
1098         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1099
1100         int rc;
1101         struct libusb_transfer *xfr;
1102
1103         rc = libusb_init(nullptr);
1104         if (rc < 0) {
1105                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1106                 exit(1);
1107         }
1108
1109         if (dev == nullptr) {
1110                 devh = open_card(card_index, &description);
1111         } else {
1112                 devh = open_card(card_index, dev, &description);
1113                 libusb_unref_device(dev);
1114         }
1115         if (!devh) {
1116                 fprintf(stderr, "Error finding USB device\n");
1117                 exit(1);
1118         }
1119
1120         libusb_config_descriptor *config;
1121         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1122         if (rc < 0) {
1123                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1124                 exit(1);
1125         }
1126
1127 #if 0
1128         printf("%d interface\n", config->bNumInterfaces);
1129         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1130                 printf("  interface %d\n", interface_number);
1131                 const libusb_interface *interface = &config->interface[interface_number];
1132                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1133                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1134                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1135                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1136                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1137                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1138                         }
1139                 }
1140         }
1141 #endif
1142
1143         rc = libusb_set_configuration(devh, /*configuration=*/1);
1144         if (rc < 0) {
1145                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1146                 exit(1);
1147         }
1148
1149         rc = libusb_claim_interface(devh, 0);
1150         if (rc < 0) {
1151                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1152                 exit(1);
1153         }
1154
1155         // Alternate setting 1 is output, alternate setting 2 is input.
1156         // Card is reset when switching alternates, so the driver uses
1157         // this “double switch” when it wants to reset.
1158         //
1159         // There's also alternate settings 3 and 4, which seem to be
1160         // like 1 and 2 except they advertise less bandwidth needed.
1161         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1162         if (rc < 0) {
1163                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1164                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1165                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1166                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1167                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1168                 }
1169                 exit(1);
1170         }
1171         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1172         if (rc < 0) {
1173                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1174                 exit(1);
1175         }
1176 #if 0
1177         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1178         if (rc < 0) {
1179                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1180                 exit(1);
1181         }
1182 #endif
1183
1184 #if 0
1185         rc = libusb_claim_interface(devh, 3);
1186         if (rc < 0) {
1187                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1188                 exit(1);
1189         }
1190 #endif
1191
1192         // theories:
1193         //   44 is some kind of timer register (first 16 bits count upwards)
1194         //   24 is some sort of watchdog?
1195         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1196         //      (or will go to 0x73c60010?), also seen 0x73c60100
1197         //   12 also changes all the time, unclear why  
1198         //   16 seems to be autodetected mode somehow
1199         //      --    this is e00115e0 after reset?
1200         //                    ed0115e0 after mode change [to output?]
1201         //                    2d0015e0 after more mode change [to input]
1202         //                    ed0115e0 after more mode change
1203         //                    2d0015e0 after more mode change
1204         //
1205         //                    390115e0 seems to indicate we have signal
1206         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1207         //
1208         //                    200015e0 on startup
1209         //         changes to 250115e0 when we sync to the signal
1210         //
1211         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1212         //
1213         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1214         //
1215         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1216         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1217         //
1218         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1219         //    perhaps some of them are related to analog output?
1220         //
1221         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1222         //    but the driver sets it to 0x8036802a at some point.
1223         //
1224         //    all of this is on request 214/215. other requests (192, 219,
1225         //    222, 223, 224) are used for firmware upgrade. Probably best to
1226         //    stay out of it unless you know what you're doing.
1227         //
1228         //
1229         // register 16:
1230         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1231         //
1232         // theories:
1233         //   0x01 - stable signal
1234         //   0x04 - deep color
1235         //   0x08 - unknown (audio??)
1236         //   0x20 - 720p??
1237         //   0x30 - 576p??
1238
1239         update_capture_mode();
1240
1241         struct ctrl {
1242                 int endpoint;
1243                 int request;
1244                 int index;
1245                 uint32_t data;
1246         };
1247         static const ctrl ctrls[] = {
1248                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1249                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1250
1251                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1252                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1253                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1254                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1255         };
1256
1257         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1258                 uint32_t flipped = htonl(ctrls[req].data);
1259                 static uint8_t value[4];
1260                 memcpy(value, &flipped, sizeof(flipped));
1261                 int size = sizeof(value);
1262                 //if (ctrls[req].request == 215) size = 0;
1263                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1264                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1265                 if (rc < 0) {
1266                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1267                         exit(1);
1268                 }
1269
1270                 if (ctrls[req].index == 16 && rc == 4) {
1271                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1272                 }
1273
1274 #if 0
1275                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1276                 for (int i = 0; i < rc; ++i) {
1277                         printf("%02x", value[i]);
1278                 }
1279                 printf("\n");
1280 #endif
1281         }
1282
1283 #if 0
1284         // DEBUG
1285         for ( ;; ) {
1286                 static int my_index = 0;
1287                 static uint8_t value[4];
1288                 int size = sizeof(value);
1289                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1290                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1291                 if (rc < 0) {
1292                         fprintf(stderr, "Error on control\n");
1293                         exit(1);
1294                 }
1295                 printf("rc=%d index=%d: 0x", rc, my_index);
1296                 for (int i = 0; i < rc; ++i) {
1297                         printf("%02x", value[i]);
1298                 }
1299                 printf("\n");
1300         }
1301 #endif
1302
1303 #if 0
1304         // set up an asynchronous transfer of the timer register
1305         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1306         static int completed = 0;
1307
1308         xfr = libusb_alloc_transfer(0);
1309         libusb_fill_control_setup(cmdbuf,
1310             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1311                 /*index=*/44, /*length=*/4);
1312         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1313         xfr->user_data = this;
1314         libusb_submit_transfer(xfr);
1315
1316         // set up an asynchronous transfer of register 24
1317         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1318         static int completed2 = 0;
1319
1320         xfr = libusb_alloc_transfer(0);
1321         libusb_fill_control_setup(cmdbuf2,
1322             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1323                 /*index=*/24, /*length=*/4);
1324         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1325         xfr->user_data = this;
1326         libusb_submit_transfer(xfr);
1327 #endif
1328
1329         // set up an asynchronous transfer of the register dump
1330         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1331         static int completed3 = 0;
1332
1333         xfr = libusb_alloc_transfer(0);
1334         libusb_fill_control_setup(cmdbuf3,
1335             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1336                 /*index=*/current_register, /*length=*/4);
1337         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1338         xfr->user_data = this;
1339         //libusb_submit_transfer(xfr);
1340
1341         //audiofp = fopen("audio.raw", "wb");
1342
1343         // set up isochronous transfers for audio and video
1344         for (int e = 3; e <= 4; ++e) {
1345                 int num_transfers = 6;
1346                 for (int i = 0; i < num_transfers; ++i) {
1347                         size_t buf_size;
1348                         int num_iso_pack, size;
1349                         if (e == 3) {
1350                                 // Allocate for minimum width (because that will give us the most
1351                                 // number of packets, so we don't need to reallocate, but we'll
1352                                 // default to 720p for the first frame.
1353                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1354                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1355                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1356                         } else {
1357                                 size = 0xc0;
1358                                 num_iso_pack = 80;
1359                                 buf_size = num_iso_pack * size;
1360                         }
1361                         int num_bytes = num_iso_pack * size;
1362                         assert(size_t(num_bytes) <= buf_size);
1363 #if LIBUSB_API_VERSION >= 0x01000105
1364                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1365 #else
1366                         uint8_t *buf = nullptr;
1367 #endif
1368                         if (buf == nullptr) {
1369                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1370 #if LIBUSB_API_VERSION >= 0x01000105
1371                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1372 #else
1373                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1374 #endif
1375                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1376                                 buf = new uint8_t[num_bytes];
1377                         }
1378
1379                         xfr = libusb_alloc_transfer(num_iso_pack);
1380                         if (!xfr) {
1381                                 fprintf(stderr, "oom\n");
1382                                 exit(1);
1383                         }
1384
1385                         int ep = LIBUSB_ENDPOINT_IN | e;
1386                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1387                                 num_iso_pack, cb_xfr, nullptr, 0);
1388                         libusb_set_iso_packet_lengths(xfr, size);
1389                         xfr->user_data = this;
1390
1391                         if (e == 3) {
1392                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1393                         }
1394
1395                         iso_xfrs.push_back(xfr);
1396                 }
1397         }
1398 }
1399
1400 void BMUSBCapture::start_bm_capture()
1401 {
1402         int i = 0;
1403         for (libusb_transfer *xfr : iso_xfrs) {
1404                 int rc = libusb_submit_transfer(xfr);
1405                 ++i;
1406                 if (rc < 0) {
1407                         //printf("num_bytes=%d\n", num_bytes);
1408                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1409                                 xfr->endpoint, i, libusb_error_name(rc));
1410                         exit(1);
1411                 }
1412         }
1413
1414
1415 #if 0
1416         libusb_release_interface(devh, 0);
1417 out:
1418         if (devh)
1419                 libusb_close(devh);
1420         libusb_exit(nullptr);
1421         return rc;
1422 #endif
1423 }
1424
1425 void BMUSBCapture::stop_dequeue_thread()
1426 {
1427         dequeue_thread_should_quit = true;
1428         queues_not_empty.notify_all();
1429         dequeue_thread.join();
1430 }
1431
1432 void BMUSBCapture::start_bm_thread()
1433 {
1434         // Devices leaving are discovered by seeing the isochronous packets
1435         // coming back with errors, so only care about devices joining.
1436         if (card_connected_callback != nullptr) {
1437                 if (libusb_hotplug_register_callback(
1438                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1439                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1440                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1441                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1442                         exit(1);
1443                 }
1444         }
1445
1446         should_quit = false;
1447         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1448 }
1449
1450 void BMUSBCapture::stop_bm_thread()
1451 {
1452         should_quit = true;
1453         usb_thread.join();
1454 }
1455
1456 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1457 {
1458         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1459         VideoMode auto_mode;
1460         auto_mode.name = "Autodetect";
1461         auto_mode.autodetect = true;
1462         return {{ 0, auto_mode }};
1463 }
1464
1465 uint32_t BMUSBCapture::get_current_video_mode() const
1466 {
1467         return 0;  // Matches get_available_video_modes().
1468 }
1469
1470 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1471 {
1472         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1473 }
1474
1475 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1476 {
1477         return {
1478                 { 0x00000000, "HDMI/SDI" },
1479                 { 0x02000000, "Component" },
1480                 { 0x04000000, "Composite" },
1481                 { 0x06000000, "S-video" }
1482         };
1483 }
1484
1485 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1486 {
1487         assert((video_input_id & ~0x06000000) == 0);
1488         current_video_input = video_input_id;
1489         update_capture_mode();
1490 }
1491
1492 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1493 {
1494         return {
1495                 { 0x00000000, "Embedded" },
1496                 { 0x10000000, "Analog" }
1497         };
1498 }
1499
1500 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1501 {
1502         assert((audio_input_id & ~0x10000000) == 0);
1503         current_audio_input = audio_input_id;
1504         update_capture_mode();
1505 }
1506
1507 void BMUSBCapture::update_capture_mode()
1508 {
1509         if (devh == nullptr) {
1510                 return;
1511         }
1512
1513         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1514         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1515         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1516                 mode |= htonl(0x20000000);
1517         } else {
1518                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1519         }
1520
1521         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1522                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1523         if (rc < 0) {
1524                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1525                 exit(1);
1526         }
1527 }
1528
1529 }  // namespace bmusb