]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Release 0.7.7 (no code changes, Makefile only).
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.7.4
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
// USB vendor ID for Blackmagic devices.
#define USB_VENDOR_BLACKMAGIC 0x1edb

// Smallest frame width accepted when sizing isochronous packets.
#define MIN_WIDTH 640

// Number of bytes at the start of a video frame that precede the payload.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0

// Number of bytes at the start of an audio block that precede the payload.
#define AUDIO_HEADER_SIZE 4

#define FRAME_SIZE (8 * 1024 * 1024)  // 8 MB.
#define USB_VIDEO_TRANSFER_SIZE (128 * 1024)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
// Optional raw audio dump target; dump_audio_block() writes to it only
// when it is non-null.
FILE *audiofp = nullptr;

// NOTE(review): presumably the thread running the libusb event loop and
// its shutdown flag; neither is used in this part of the file -- confirm.
thread usb_thread;
atomic<bool> should_quit{false};
65
// Returns the number of bytes in one line of the given width when every
// (started) group of six pixels occupies four 32-bit words.
int v210_stride(int width)
{
        const int six_pixel_groups = (width + 5) / 6;  // Rounded up.
        return six_pixel_groups * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row of the table of known video formats in decode_video_format().
// Member order matters, since the table uses aggregate initialization.
struct VideoFormatEntry {
        // Format word with the flag bits masked out; the lookup key.
        uint16_t normalized_video_format;

        // Picture dimensions, and the value copied to
        // VideoFormat::second_field_start (zero for the progressive rows).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Values copied to VideoFormat::extra_lines_top/extra_lines_bottom.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction (frame_rate_nom / frame_rate_den).
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0151,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
203                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
204                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
205                 { 0x0161, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
206                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
207                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
208                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
209                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
210                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
211                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
212                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
213                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
214                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
215                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
216                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
217         };
218         for (const VideoFormatEntry &entry : entries) {
219                 if (normalized_video_format == entry.normalized_video_format) {
220                         decoded_video_format->width = entry.width;
221                         decoded_video_format->height = entry.height;
222                         if (video_format & 0x0800) {
223                                 decoded_video_format->stride = entry.width * 2;
224                         } else {
225                                 decoded_video_format->stride = v210_stride(entry.width);
226                         }
227                         decoded_video_format->second_field_start = entry.second_field_start;
228                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
229                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
230                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
231                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
232                         decoded_video_format->interlaced = entry.interlaced;
233                         return true;
234                 }
235         }
236
237         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
238         decoded_video_format->width = 1280;
239         decoded_video_format->height = 720;
240         decoded_video_format->stride = 1280 * 2;
241         decoded_video_format->frame_rate_nom = 60;
242         decoded_video_format->frame_rate_den = 1;
243         return false;
244 }
245
246 // There are seemingly no direct indicators of sample rate; you just get
247 // one frame's worth and have to guess from that.
248 int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
249 {
250         size_t num_samples = len / 3 / 8;
251         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
252
253         // See if we match or are very close to any of the mandatory HDMI sample rates.
254         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
255         for (int rate : candidate_sample_rates) {
256                 if (abs(int(num_samples_per_second) - rate) <= 100) {
257                         return rate;
258                 }
259         }
260
261         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, keeping capture at %d Hz\n",
262                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
263         return default_rate;
264 }
265
266 }  // namespace
267
268 FrameAllocator::~FrameAllocator() {}
269
270 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
271         : frame_size(frame_size)
272 {
273         for (size_t i = 0; i < num_queued_frames; ++i) {
274                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
275         }
276 }
277
278 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
279 {
280         Frame vf;
281         vf.owner = this;
282
283         unique_lock<mutex> lock(freelist_mutex);  // Meh.
284         if (freelist.empty()) {
285                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
286                         frame_size);
287         } else {
288                 vf.data = freelist.top().release();
289                 vf.size = frame_size;
290                 freelist.pop();  // Meh.
291         }
292         return vf;
293 }
294
295 void MallocFrameAllocator::release_frame(Frame frame)
296 {
297         if (frame.overflow > 0) {
298                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
299         }
300         unique_lock<mutex> lock(freelist_mutex);
301         freelist.push(unique_ptr<uint8_t[]>(frame.data));
302 }
303
// Returns true if "a" comes before "b" on a 16-bit counter that wraps
// around, i.e., if going forward from a reaches b in at least one and
// fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, modulo 65536.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
315
316 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
317 {
318         unique_lock<mutex> lock(queue_lock);
319         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
320                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
321                         q->back().timecode, timecode);
322                 frame.owner->release_frame(frame);
323                 return;
324         }
325
326         QueuedFrame qf;
327         qf.format = format;
328         qf.timecode = timecode;
329         qf.frame = frame;
330         q->push_back(move(qf));
331         queues_not_empty.notify_one();  // might be spurious
332 }
333
334 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
335 {
336         FILE *fp = fopen(filename, "wb");
337         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
338                 printf("short write!\n");
339         }
340         fclose(fp);
341 }
342
343 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
344 {
345         if (audiofp != nullptr) {
346                 fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
347         }
348 }
349
350 void BMUSBCapture::dequeue_thread_func()
351 {
352         char thread_name[16];
353         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
354         pthread_setname_np(pthread_self(), thread_name);
355
356         if (has_dequeue_callbacks) {
357                 dequeue_init_callback();
358         }
359         size_t last_sample_rate = 48000;
360         while (!dequeue_thread_should_quit) {
361                 unique_lock<mutex> lock(queue_lock);
362                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
363
364                 if (dequeue_thread_should_quit) break;
365
366                 uint16_t video_timecode = pending_video_frames.front().timecode;
367                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
368                 AudioFormat audio_format;
369                 audio_format.bits_per_sample = 24;
370                 audio_format.num_channels = 8;
371                 audio_format.sample_rate = last_sample_rate;
372                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
373                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
374                                 video_timecode);
375                         QueuedFrame video_frame = pending_video_frames.front();
376                         pending_video_frames.pop_front();
377                         lock.unlock();
378                         video_frame_allocator->release_frame(video_frame.frame);
379                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
380                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
381                                 audio_timecode);
382                         QueuedFrame audio_frame = pending_audio_frames.front();
383                         pending_audio_frames.pop_front();
384                         lock.unlock();
385                         audio_format.id = audio_frame.format;
386
387                         // Use the video format of the pending frame.
388                         QueuedFrame video_frame = pending_video_frames.front();
389                         VideoFormat video_format;
390                         decode_video_format(video_frame.format, &video_format);
391
392                         frame_callback(audio_timecode,
393                                        FrameAllocator::Frame(), 0, video_format,
394                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
395                 } else {
396                         QueuedFrame video_frame = pending_video_frames.front();
397                         QueuedFrame audio_frame = pending_audio_frames.front();
398                         pending_audio_frames.pop_front();
399                         pending_video_frames.pop_front();
400                         lock.unlock();
401
402 #if 0
403                         char filename[255];
404                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
405                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
406                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
407 #endif
408
409                         VideoFormat video_format;
410                         audio_format.id = audio_frame.format;
411                         if (decode_video_format(video_frame.format, &video_format)) {
412                                 if (audio_frame.frame.len != 0) {
413                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
414                                         last_sample_rate = audio_format.sample_rate;
415                                 }
416                                 frame_callback(video_timecode,
417                                                video_frame.frame, HEADER_SIZE, video_format,
418                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
419                         } else {
420                                 video_frame_allocator->release_frame(video_frame.frame);
421                                 audio_format.sample_rate = last_sample_rate;
422                                 frame_callback(video_timecode,
423                                                FrameAllocator::Frame(), 0, video_format,
424                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
425                         }
426                 }
427         }
428         if (has_dequeue_callbacks) {
429                 dequeue_cleanup_callback();
430         }
431 }
432
433 void BMUSBCapture::start_new_frame(const uint8_t *start)
434 {
435         uint16_t format = (start[3] << 8) | start[2];
436         uint16_t timecode = (start[1] << 8) | start[0];
437
438         if (current_video_frame.len > 0) {
439                 current_video_frame.received_timestamp = steady_clock::now();
440
441                 // If format is 0x0800 (no signal), add a fake (empty) audio
442                 // frame to get it out of the queue.
443                 // TODO: Figure out if there are other formats that come with
444                 // no audio, and treat them the same.
445                 if (format == 0x0800) {
446                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
447                         if (fake_audio_frame.data == nullptr) {
448                                 // Oh well, it's just a no-signal frame anyway.
449                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
450                                 current_video_frame.owner->release_frame(current_video_frame);
451                                 current_video_frame = video_frame_allocator->alloc_frame();
452                                 return;
453                         }
454                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
455                 }
456                 //dump_frame();
457                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
458
459                 // Update the assumed frame width. We might be one frame too late on format changes,
460                 // but it's much better than asking the user to choose manually.
461                 VideoFormat video_format;
462                 if (decode_video_format(format, &video_format)) {
463                         assumed_frame_width = video_format.width;
464                 }
465         }
466         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
467         //      format, timecode,
468         //      //start[7], start[6], start[5], start[4],
469         //      read_current_frame, FRAME_SIZE);
470
471         current_video_frame = video_frame_allocator->alloc_frame();
472         //if (current_video_frame.data == nullptr) {
473         //      read_current_frame = -1;
474         //} else {
475         //      read_current_frame = 0;
476         //}
477 }
478
479 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
480 {
481         uint16_t format = (start[3] << 8) | start[2];
482         uint16_t timecode = (start[1] << 8) | start[0];
483         if (current_audio_frame.len > 0) {
484                 current_audio_frame.received_timestamp = steady_clock::now();
485                 //dump_audio_block();
486                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
487         }
488         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
489         //      format, timecode);
490         current_audio_frame = audio_frame_allocator->alloc_frame();
491 }
492
493 #if 0
494 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
495 {
496         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
497         for (unsigned j = 0; j < pack->actual_length; j++) {
498         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
499                 printf("%02x", xfr->buffer[j + offset]);
500                 if ((j % 16) == 15)
501                         printf("\n");
502                 else if ((j % 8) == 7)
503                         printf("  ");
504                 else
505                         printf(" ");
506         }
507 }
508 #endif
509
// Splits the n bytes at src into two streams: bytes at even offsets go to
// dest1 and bytes at odd offsets go to dest2 (n/2 bytes each).
// n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t pair = 0; 2 * pair < n; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
521
522 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
523 {
524         if (current_frame->data == nullptr ||
525             current_frame->len > current_frame->size ||
526             start == end) {
527                 return;
528         }
529
530         int bytes = end - start;
531         if (current_frame->len + bytes > current_frame->size) {
532                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
533                 current_frame->len = current_frame->size;
534                 if (current_frame->overflow > 1048576) {
535                         printf("%d bytes overflow after last %s frame\n",
536                                 int(current_frame->overflow), frame_type_name);
537                         current_frame->overflow = 0;
538                 }
539                 //dump_frame();
540         } else {
541                 if (current_frame->data_copy != nullptr) {
542                         memcpy(current_frame->data_copy + current_frame->len, start, bytes);
543                 }
544                 if (current_frame->interleaved) {
545                         uint8_t *data = current_frame->data + current_frame->len / 2;
546                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
547                         if (current_frame->len % 2 == 1) {
548                                 ++data;
549                                 swap(data, data2);
550                         }
551                         if (bytes % 2 == 1) {
552                                 *data++ = *start++;
553                                 swap(data, data2);
554                                 ++current_frame->len;
555                                 --bytes;
556                         }
557                         memcpy_interleaved(data, data2, start, bytes);
558                         current_frame->len += bytes;
559                 } else {
560                         memcpy(current_frame->data + current_frame->len, start, bytes);
561                         current_frame->len += bytes;
562                 }
563         }
564 }
565
566 #if 0
// Debugging aid: prints the given name followed by all 32 bytes of the
// register in hex (lowest byte first), in four groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        // Spill the register to memory, so that the bytes can be indexed
        // with a loop variable.
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                printf(" %02x", bytes[i]);
                if (i % 8 == 7 && i != 31) {
                        // Extra space between the groups.
                        printf(" ");
                }
        }
        printf("\n");
}
607 #endif
608
609 #ifndef HAS_MULTIVERSIONING
610
611 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
612 {
613         // No fast path possible unless we have multiversioning.
614         return start;
615 }
616
617 #else  // defined(HAS_MULTIVERSIONING)
618
619 __attribute__((target("sse4.1")))
620 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
621
622 __attribute__((target("avx2")))
623 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
624
625 // Does a memcpy and memchr in one to reduce processing time.
626 // Note that the benefit is somewhat limited if your L3 cache is small,
627 // as you'll (unfortunately) spend most of the time loading the data
628 // from main memory.
629 //
630 // Complicated cases are left to the slow path; it basically stops copying
631 // up until the first instance of "sync_char" (usually a bit before, actually).
632 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
633 // data, and what we really need this for is the 00 00 ff ff marker in video data.
634 __attribute__((target("default")))
635 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
636 {
637         // No fast path possible unless we have SSE 4.1 or higher.
638         return start;
639 }
640
641 __attribute__((target("sse4.1", "avx2")))
642 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
643 {
644         if (current_frame->data == nullptr ||
645             current_frame->len > current_frame->size ||
646             start == limit) {
647                 return start;
648         }
649         size_t orig_bytes = limit - start;
650         if (orig_bytes < 128) {
651                 // Don't bother.
652                 return start;
653         }
654
655         // Don't read more bytes than we can write.
656         limit = min(limit, start + (current_frame->size - current_frame->len));
657
658         // Align end to 32 bytes.
659         limit = (const uint8_t *)(intptr_t(limit) & ~31);
660
661         if (start >= limit) {
662                 return start;
663         }
664
665         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
666         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
667         if (aligned_start != start) {
668                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
669                 if (sync_start == nullptr) {
670                         add_to_frame(current_frame, "", start, aligned_start);
671                 } else {
672                         add_to_frame(current_frame, "", start, sync_start);
673                         return sync_start;
674                 }
675         }
676
677         // Make the length a multiple of 64.
678         if (current_frame->interleaved) {
679                 if (((limit - aligned_start) % 64) != 0) {
680                         limit -= 32;
681                 }
682                 assert(((limit - aligned_start) % 64) == 0);
683         }
684
685         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
686 }
687
688 __attribute__((target("avx2")))
689 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
690 {
691         const __m256i needle = _mm256_set1_epi8(sync_char);
692
693         size_t bytes_copied;
694         const __restrict __m256i *in = (const __m256i *)aligned_start;
695         if (current_frame->interleaved) {
696                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
697                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
698                 if (current_frame->len % 2 == 1) {
699                         swap(out1, out2);
700                 }
701
702                 __m256i shuffle_cw = _mm256_set_epi8(
703                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
704                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
705                 while (in < (const __m256i *)limit) {
706                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
707                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
708                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
709
710                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
711                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
712                         __m256i found = _mm256_or_si256(found1, found2);
713
714                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
715                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
716                 
717                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
718                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
719
720                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
721                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
722
723                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
724                         _mm256_storeu_si256(out2, hi);
725
726                         if (!_mm256_testz_si256(found, found)) {
727                                 break;
728                         }
729
730                         in += 2;
731                         ++out1;
732                         ++out2;
733                 }
734                 bytes_copied = (uint8_t *)in - aligned_start;
735         } else {
736                 uint8_t *old_end = current_frame->data + current_frame->len;
737                 __m256i *out = (__m256i *)old_end;
738                 while (in < (const __m256i *)limit) {
739                         __m256i data = _mm256_load_si256(in);
740                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
741                         __m256i found = _mm256_cmpeq_epi8(data, needle);
742                         if (!_mm256_testz_si256(found, found)) {
743                                 break;
744                         }
745
746                         ++in;
747                         ++out;
748                 }
749                 bytes_copied = (uint8_t *)out - old_end;
750         }
751         if (current_frame->data_copy != nullptr) {
752                 // TODO: It would be somewhat more cache-efficient to write this in the
753                 // same loop as above. However, it might not be worth the extra complexity.
754                 memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
755         }
756         current_frame->len += bytes_copied;
757
758         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
759         return (const uint8_t *)in;
760 }
761
762 __attribute__((target("sse4.1")))
763 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
764 {
765         const __m128i needle = _mm_set1_epi8(sync_char);
766
767         const __m128i *in = (const __m128i *)aligned_start;
768         size_t bytes_copied;
769         if (current_frame->interleaved) {
770                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
771                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
772                 if (current_frame->len % 2 == 1) {
773                         swap(out1, out2);
774                 }
775
776                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
777                 while (in < (const __m128i *)limit) {
778                         __m128i data1 = _mm_load_si128(in);
779                         __m128i data2 = _mm_load_si128(in + 1);
780                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
781                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
782                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
783                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
784                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
785                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
786                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
787                         _mm_storeu_si128(out2, hi);
788                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
789                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
790                         if (!_mm_testz_si128(found1, found1) ||
791                             !_mm_testz_si128(found2, found2)) {
792                                 break;
793                         }
794
795                         in += 2;
796                         ++out1;
797                         ++out2;
798                 }
799                 bytes_copied = (uint8_t *)in - aligned_start;
800         } else {
801                 uint8_t *old_end = current_frame->data + current_frame->len;
802                 __m128i *out = (__m128i *)old_end;
803                 while (in < (const __m128i *)limit) {
804                         __m128i data = _mm_load_si128(in);
805                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
806                         __m128i found = _mm_cmpeq_epi8(data, needle);
807                         if (!_mm_testz_si128(found, found)) {
808                                 break;
809                         }
810
811                         ++in;
812                         ++out;
813                 }
814                 bytes_copied = (uint8_t *)out - old_end;
815         }
816         if (current_frame->data_copy != nullptr) {
817                 // TODO: It would be somewhat more cache-efficient to write this in the
818                 // same loop as above. However, it might not be worth the extra complexity.
819                 memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
820         }
821         current_frame->len += bytes_copied;
822
823         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
824         return (const uint8_t *)in;
825 }
826
827 #endif  // defined(HAS_MULTIVERSIONING)
828
829 void decode_packs(const libusb_transfer *xfr,
830                   const char *sync_pattern,
831                   int sync_length,
832                   FrameAllocator::Frame *current_frame,
833                   const char *frame_type_name,
834                   function<void(const uint8_t *start)> start_callback)
835 {
836         int offset = 0;
837         for (int i = 0; i < xfr->num_iso_packets; i++) {
838                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
839
840                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
841                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
842                         continue;
843 //exit(5);
844                 }
845
846                 const uint8_t *start = xfr->buffer + offset;
847                 const uint8_t *limit = start + pack->actual_length;
848                 while (start < limit) {  // Usually runs only one iteration.
849                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
850                         if (start == limit) break;
851                         assert(start < limit);
852
853                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
854                         if (start_next_frame == nullptr) {
855                                 // add the rest of the buffer
856                                 add_to_frame(current_frame, frame_type_name, start, limit);
857                                 break;
858                         } else {
859                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
860                                 start = start_next_frame + sync_length;  // skip sync
861                                 start_callback(start);
862                         }
863                 }
864 #if 0
865                 dump_pack(xfr, offset, pack);
866 #endif
867                 offset += pack->length;
868         }
869 }
870
871 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
872 {
873         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
874             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
875                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
876                 libusb_free_transfer(xfr);
877                 exit(3);
878         }
879
880         assert(xfr->user_data != nullptr);
881         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
882
883         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
884                 if (!usb->disconnected) {
885                         fprintf(stderr, "Device went away, stopping transfers.\n");
886                         usb->disconnected = true;
887                         if (usb->card_disconnected_callback) {
888                                 usb->card_disconnected_callback();
889                         }
890                 }
891                 // Don't reschedule the transfer; the loop will stop by itself.
892                 return;
893         }
894
895         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
896                 if (xfr->endpoint == 0x84) {
897                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
898                 } else {
899                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
900
901                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
902                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
903                 }
904         }
905         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
906                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
907                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
908 #if 0
909                 if (setup->wIndex == 44) {
910                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
911                 } else {
912                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
913                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
914                 }
915 #else
916                 memcpy(usb->register_file + usb->current_register, buf, 4);
917                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
918                 if (usb->current_register == 0) {
919                         // read through all of them
920                         printf("register dump:");
921                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
922                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
923                         }
924                         printf("\n");
925                 }
926                 libusb_fill_control_setup(xfr->buffer,
927                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
928                         /*index=*/usb->current_register, /*length=*/4);
929 #endif
930         }
931
932 #if 0
933         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
934         for (i = 0; i < xfr->actual_length; i++) {
935                 printf("%02x", xfr->buffer[i]);
936                 if (i % 16)
937                         printf("\n");
938                 else if (i % 8)
939                         printf("  ");
940                 else
941                         printf(" ");
942         }
943 #endif
944
945         int rc = libusb_submit_transfer(xfr);
946         if (rc < 0) {
947                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
948                 exit(1);
949         }
950 }
951
952 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
953 {
954         if (card_connected_callback != nullptr) {
955                 libusb_device_descriptor desc;
956                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
957                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
958                         libusb_unref_device(dev);
959                         return 1;
960                 }
961
962                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
963                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
964                         card_connected_callback(dev);  // Callback takes ownership.
965                         return 0;
966                 }
967         }
968         libusb_unref_device(dev);
969         return 0;
970 }
971
972 void BMUSBCapture::usb_thread_func()
973 {
974         sched_param param;
975         memset(&param, 0, sizeof(param));
976         param.sched_priority = 1;
977         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
978                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
979         }
980         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
981         while (!should_quit) {
982                 timeval sec { 1, 0 };
983                 int rc = libusb_handle_events_timeout(nullptr, &sec);
984                 if (rc != LIBUSB_SUCCESS)
985                         break;
986         }
987 }
988
989 namespace {
990
991 struct USBCardDevice {
992         uint16_t product;
993         uint8_t bus, port;
994         libusb_device *device;
995 };
996
// Maps a supported USB product ID to a human-readable product name.
// Any other ID is a programming error: it trips an assertion, and yields
// nullptr if assertions are compiled out.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
1008
1009 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
1010 {
1011         const char *product_name = get_product_name(product);
1012
1013         char buf[256];
1014         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
1015                 id, bus, port, product_name);
1016         return buf;
1017 }
1018
1019 vector<USBCardDevice> find_all_cards()
1020 {
1021         libusb_device **devices;
1022         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
1023         if (num_devices == -1) {
1024                 fprintf(stderr, "Error finding USB devices\n");
1025                 exit(1);
1026         }
1027         vector<USBCardDevice> found_cards;
1028         for (ssize_t i = 0; i < num_devices; ++i) {
1029                 libusb_device_descriptor desc;
1030                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1031                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1032                         exit(1);
1033                 }
1034
1035                 uint8_t bus = libusb_get_bus_number(devices[i]);
1036                 uint8_t port = libusb_get_port_number(devices[i]);
1037
1038                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1039                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1040                         libusb_unref_device(devices[i]);
1041                         continue;
1042                 }
1043
1044                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1045         }
1046         libusb_free_device_list(devices, 0);
1047
1048         // Sort the devices to get a consistent ordering.
1049         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1050                 if (a.product != b.product)
1051                         return a.product < b.product;
1052                 if (a.bus != b.bus)
1053                         return a.bus < b.bus;
1054                 return a.port < b.port;
1055         });
1056
1057         return found_cards;
1058 }
1059
1060 libusb_device_handle *open_card(int card_index, string *description)
1061 {
1062         vector<USBCardDevice> found_cards = find_all_cards();
1063
1064         for (size_t i = 0; i < found_cards.size(); ++i) {
1065                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1066                 fprintf(stderr, "%s\n", tmp_description.c_str());
1067                 if (i == size_t(card_index)) {
1068                         *description = tmp_description;
1069                 }
1070         }
1071
1072         if (size_t(card_index) >= found_cards.size()) {
1073                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1074                 exit(1);
1075         }
1076
1077         libusb_device_handle *devh;
1078         int rc = libusb_open(found_cards[card_index].device, &devh);
1079         if (rc < 0) {
1080                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1081                 exit(1);
1082         }
1083
1084         for (size_t i = 0; i < found_cards.size(); ++i) {
1085                 libusb_unref_device(found_cards[i].device);
1086         }
1087
1088         return devh;
1089 }
1090
1091 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1092 {
1093         uint8_t bus = libusb_get_bus_number(dev);
1094         uint8_t port = libusb_get_port_number(dev);
1095
1096         libusb_device_descriptor desc;
1097         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1098                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1099                 exit(1);
1100         }
1101
1102         *description = get_card_description(card_index, bus, port, desc.idProduct);
1103
1104         libusb_device_handle *devh;
1105         int rc = libusb_open(dev, &devh);
1106         if (rc < 0) {
1107                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1108                 exit(1);
1109         }
1110
1111         return devh;
1112 }
1113
1114 }  // namespace
1115
1116 unsigned BMUSBCapture::num_cards()
1117 {
1118         int rc = libusb_init(nullptr);
1119         if (rc < 0) {
1120                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1121                 exit(1);
1122         }
1123
1124         vector<USBCardDevice> found_cards = find_all_cards();
1125         unsigned ret = found_cards.size();
1126         for (size_t i = 0; i < found_cards.size(); ++i) {
1127                 libusb_unref_device(found_cards[i].device);
1128         }
1129         return ret;
1130 }
1131
1132 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1133 {
1134         current_pixel_format = pixel_format;
1135         update_capture_mode();
1136 }
1137
1138 void BMUSBCapture::configure_card()
1139 {
1140         if (video_frame_allocator == nullptr) {
1141                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1142                 set_video_frame_allocator(owned_video_frame_allocator.get());
1143         }
1144         if (audio_frame_allocator == nullptr) {
1145                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1146                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1147         }
1148         dequeue_thread_should_quit = false;
1149         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1150
1151         int rc;
1152         struct libusb_transfer *xfr;
1153
1154         rc = libusb_init(nullptr);
1155         if (rc < 0) {
1156                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1157                 exit(1);
1158         }
1159
1160         if (dev == nullptr) {
1161                 devh = open_card(card_index, &description);
1162         } else {
1163                 devh = open_card(card_index, dev, &description);
1164                 libusb_unref_device(dev);
1165         }
1166         if (!devh) {
1167                 fprintf(stderr, "Error finding USB device\n");
1168                 exit(1);
1169         }
1170
1171         libusb_config_descriptor *config;
1172         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1173         if (rc < 0) {
1174                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1175                 exit(1);
1176         }
1177
1178 #if 0
1179         printf("%d interface\n", config->bNumInterfaces);
1180         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1181                 printf("  interface %d\n", interface_number);
1182                 const libusb_interface *interface = &config->interface[interface_number];
1183                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1184                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1185                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1186                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1187                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1188                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1189                         }
1190                 }
1191         }
1192 #endif
1193
1194         rc = libusb_set_configuration(devh, /*configuration=*/1);
1195         if (rc < 0) {
1196                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1197                 exit(1);
1198         }
1199
1200         rc = libusb_claim_interface(devh, 0);
1201         if (rc < 0) {
1202                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1203                 exit(1);
1204         }
1205
1206         // Alternate setting 1 is output, alternate setting 2 is input.
1207         // Card is reset when switching alternates, so the driver uses
1208         // this “double switch” when it wants to reset.
1209         //
1210         // There's also alternate settings 3 and 4, which seem to be
1211         // like 1 and 2 except they advertise less bandwidth needed.
1212         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1213         if (rc < 0) {
1214                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1215                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1216                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1217                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1218                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1219                 }
1220                 exit(1);
1221         }
1222         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1223         if (rc < 0) {
1224                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1225                 exit(1);
1226         }
1227 #if 0
1228         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1229         if (rc < 0) {
1230                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1231                 exit(1);
1232         }
1233 #endif
1234
1235 #if 0
1236         rc = libusb_claim_interface(devh, 3);
1237         if (rc < 0) {
1238                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1239                 exit(1);
1240         }
1241 #endif
1242
1243         // theories:
1244         //   44 is some kind of timer register (first 16 bits count upwards)
1245         //   24 is some sort of watchdog?
1246         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1247         //      (or will go to 0x73c60010?), also seen 0x73c60100
1248         //   12 also changes all the time, unclear why  
1249         //   16 seems to be autodetected mode somehow
1250         //      --    this is e00115e0 after reset?
1251         //                    ed0115e0 after mode change [to output?]
1252         //                    2d0015e0 after more mode change [to input]
1253         //                    ed0115e0 after more mode change
1254         //                    2d0015e0 after more mode change
1255         //
1256         //                    390115e0 seems to indicate we have signal
1257         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1258         //
1259         //                    200015e0 on startup
1260         //         changes to 250115e0 when we sync to the signal
1261         //
1262         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1263         //
        //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
1265         //
1266         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1267         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1268         //
1269         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1270         //    perhaps some of them are related to analog output?
1271         //
1272         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1273         //    but the driver sets it to 0x8036802a at some point.
1274         //
1275         //    all of this is on request 214/215. other requests (192, 219,
1276         //    222, 223, 224) are used for firmware upgrade. Probably best to
1277         //    stay out of it unless you know what you're doing.
1278         //
1279         //
1280         // register 16:
1281         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1282         //
1283         // theories:
1284         //   0x01 - stable signal
1285         //   0x04 - deep color
1286         //   0x08 - unknown (audio??)
1287         //   0x20 - 720p??
1288         //   0x30 - 576p??
1289
1290         update_capture_mode();
1291
1292         struct ctrl {
1293                 int endpoint;
1294                 int request;
1295                 int index;
1296                 uint32_t data;
1297         };
1298         static const ctrl ctrls[] = {
1299                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1300                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1301
1302                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1303                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1304                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1305                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1306         };
1307
1308         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1309                 uint32_t flipped = htonl(ctrls[req].data);
1310                 static uint8_t value[4];
1311                 memcpy(value, &flipped, sizeof(flipped));
1312                 int size = sizeof(value);
1313                 //if (ctrls[req].request == 215) size = 0;
1314                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1315                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1316                 if (rc < 0) {
1317                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1318                         exit(1);
1319                 }
1320
1321                 if (ctrls[req].index == 16 && rc == 4) {
1322                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1323                 }
1324
1325 #if 0
1326                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1327                 for (int i = 0; i < rc; ++i) {
1328                         printf("%02x", value[i]);
1329                 }
1330                 printf("\n");
1331 #endif
1332         }
1333
1334 #if 0
1335         // DEBUG
1336         for ( ;; ) {
1337                 static int my_index = 0;
1338                 static uint8_t value[4];
1339                 int size = sizeof(value);
1340                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1341                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1342                 if (rc < 0) {
1343                         fprintf(stderr, "Error on control\n");
1344                         exit(1);
1345                 }
1346                 printf("rc=%d index=%d: 0x", rc, my_index);
1347                 for (int i = 0; i < rc; ++i) {
1348                         printf("%02x", value[i]);
1349                 }
1350                 printf("\n");
1351         }
1352 #endif
1353
1354 #if 0
1355         // set up an asynchronous transfer of the timer register
1356         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1357         static int completed = 0;
1358
1359         xfr = libusb_alloc_transfer(0);
1360         libusb_fill_control_setup(cmdbuf,
1361             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1362                 /*index=*/44, /*length=*/4);
1363         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1364         xfr->user_data = this;
1365         libusb_submit_transfer(xfr);
1366
1367         // set up an asynchronous transfer of register 24
1368         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1369         static int completed2 = 0;
1370
1371         xfr = libusb_alloc_transfer(0);
1372         libusb_fill_control_setup(cmdbuf2,
1373             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1374                 /*index=*/24, /*length=*/4);
1375         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1376         xfr->user_data = this;
1377         libusb_submit_transfer(xfr);
1378 #endif
1379
1380         // set up an asynchronous transfer of the register dump
1381         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1382         static int completed3 = 0;
1383
1384         xfr = libusb_alloc_transfer(0);
1385         libusb_fill_control_setup(cmdbuf3,
1386             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1387                 /*index=*/current_register, /*length=*/4);
1388         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1389         xfr->user_data = this;
1390         //libusb_submit_transfer(xfr);
1391
1392         //audiofp = fopen("audio.raw", "wb");
1393
1394         // set up isochronous transfers for audio and video
1395         for (int e = 3; e <= 4; ++e) {
1396                 int num_transfers = 6;
1397                 for (int i = 0; i < num_transfers; ++i) {
1398                         size_t buf_size;
1399                         int num_iso_pack, size;
1400                         if (e == 3) {
1401                                 // Allocate for minimum width (because that will give us the most
1402                                 // number of packets, so we don't need to reallocate, but we'll
1403                                 // default to 720p for the first frame.
1404                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1405                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1406                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1407                         } else {
1408                                 size = 0xc0;
1409                                 num_iso_pack = 80;
1410                                 buf_size = num_iso_pack * size;
1411                         }
1412                         int num_bytes = num_iso_pack * size;
1413                         assert(size_t(num_bytes) <= buf_size);
1414 #if LIBUSB_API_VERSION >= 0x01000105
1415                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1416 #else
1417                         uint8_t *buf = nullptr;
1418 #endif
1419                         if (buf == nullptr) {
1420                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1421 #if LIBUSB_API_VERSION >= 0x01000105
1422                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1423 #else
1424                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1425 #endif
1426                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1427                                 buf = new uint8_t[num_bytes];
1428                         }
1429
1430                         xfr = libusb_alloc_transfer(num_iso_pack);
1431                         if (!xfr) {
1432                                 fprintf(stderr, "oom\n");
1433                                 exit(1);
1434                         }
1435
1436                         int ep = LIBUSB_ENDPOINT_IN | e;
1437                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1438                                 num_iso_pack, cb_xfr, nullptr, 0);
1439                         libusb_set_iso_packet_lengths(xfr, size);
1440                         xfr->user_data = this;
1441
1442                         if (e == 3) {
1443                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1444                         }
1445
1446                         iso_xfrs.push_back(xfr);
1447                 }
1448         }
1449 }
1450
1451 void BMUSBCapture::start_bm_capture()
1452 {
1453         int i = 0;
1454         for (libusb_transfer *xfr : iso_xfrs) {
1455                 int rc = libusb_submit_transfer(xfr);
1456                 ++i;
1457                 if (rc < 0) {
1458                         //printf("num_bytes=%d\n", num_bytes);
1459                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1460                                 xfr->endpoint, i, libusb_error_name(rc));
1461                         exit(1);
1462                 }
1463         }
1464
1465
1466 #if 0
1467         libusb_release_interface(devh, 0);
1468 out:
1469         if (devh)
1470                 libusb_close(devh);
1471         libusb_exit(nullptr);
1472         return rc;
1473 #endif
1474 }
1475
1476 void BMUSBCapture::stop_dequeue_thread()
1477 {
1478         dequeue_thread_should_quit = true;
1479         queues_not_empty.notify_all();
1480         dequeue_thread.join();
1481 }
1482
1483 void BMUSBCapture::start_bm_thread()
1484 {
1485         // Devices leaving are discovered by seeing the isochronous packets
1486         // coming back with errors, so only care about devices joining.
1487         if (card_connected_callback != nullptr) {
1488                 if (libusb_hotplug_register_callback(
1489                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1490                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1491                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1492                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1493                         exit(1);
1494                 }
1495         }
1496
1497         should_quit = false;
1498         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1499 }
1500
1501 void BMUSBCapture::stop_bm_thread()
1502 {
1503         should_quit = true;
1504         libusb_interrupt_event_handler(nullptr);
1505         usb_thread.join();
1506 }
1507
1508 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1509 {
1510         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1511         VideoMode auto_mode;
1512         auto_mode.name = "Autodetect";
1513         auto_mode.autodetect = true;
1514         return {{ 0, auto_mode }};
1515 }
1516
1517 uint32_t BMUSBCapture::get_current_video_mode() const
1518 {
1519         return 0;  // Matches get_available_video_modes().
1520 }
1521
1522 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1523 {
1524         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1525 }
1526
1527 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1528 {
1529         return {
1530                 { 0x00000000, "HDMI/SDI" },
1531                 { 0x02000000, "Component" },
1532                 { 0x04000000, "Composite" },
1533                 { 0x06000000, "S-video" }
1534         };
1535 }
1536
1537 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1538 {
1539         assert((video_input_id & ~0x06000000) == 0);
1540         current_video_input = video_input_id;
1541         update_capture_mode();
1542 }
1543
1544 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1545 {
1546         return {
1547                 { 0x00000000, "Embedded" },
1548                 { 0x10000000, "Analog" }
1549         };
1550 }
1551
1552 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1553 {
1554         assert((audio_input_id & ~0x10000000) == 0);
1555         current_audio_input = audio_input_id;
1556         update_capture_mode();
1557 }
1558
1559 void BMUSBCapture::update_capture_mode()
1560 {
1561         if (devh == nullptr) {
1562                 return;
1563         }
1564
1565         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1566         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1567         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1568                 mode |= htonl(0x20000000);
1569         } else {
1570                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1571         }
1572
1573         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1574                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1575         if (rc < 0) {
1576                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1577                 exit(1);
1578         }
1579 }
1580
1581 }  // namespace bmusb