]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Add another non-interleaved data copy (intended for VA-API MJPEG uploads).
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.7.2
2 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
// NOTE(review): presumably the USB vendor ID used to recognize Blackmagic
// devices -- confirm against the device enumeration code.
#define USB_VENDOR_BLACKMAGIC 0x1edb
// Smallest frame width accepted when sizing isochronous packets
// (asserted in change_xfer_size_for_width()).
#define MIN_WIDTH 640
// Number of bytes at the start of a video frame that are skipped before the
// picture data (used as the video offset in dump_frame() and frame_callback()).
#define HEADER_SIZE 44
//#define HEADER_SIZE 0
// Number of bytes at the start of an audio block that are skipped before the
// sample data (used as the audio offset in dump_audio_block() and frame_callback()).
#define AUDIO_HEADER_SIZE 4

#define FRAME_SIZE (8 << 20)  // 8 MB.
#define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
// Definitions of BMUSBCapture's static data members: no card-connected
// callback is registered initially, and hotplug_existing_devices starts
// out false.
card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
// Output stream that dump_audio_block() writes raw audio to. Nothing in this
// part of the file opens it.
FILE *audiofp;

// NOTE(review): presumably the thread that services USB events -- confirm
// against the code that starts it.
thread usb_thread;
// NOTE(review): presumably the flag that asks usb_thread to exit -- confirm
// against its readers and writers.
atomic<bool> should_quit;
65
// Returns the size in bytes of one line of v210-style packed video that is
// "width" pixels wide: pixels are counted in groups of six (rounded up),
// and every group takes four 32-bit words.
int v210_stride(int width)
{
        const int groups_of_six = (width + 5) / 6;
        return groups_of_six * 4 * sizeof(uint32_t);
}
70
71 int find_xfer_size_for_width(PixelFormat pixel_format, int width)
72 {
73         // Video seems to require isochronous packets scaled with the width;
74         // seemingly six lines is about right, rounded up to the required 1kB
75         // multiple.
76         // Note that for 10-bit input, you'll need to increase size accordingly.
77         int stride;
78         if (pixel_format == PixelFormat_10BitYCbCr) {
79                 stride = v210_stride(width);
80         } else {
81                 stride = width * sizeof(uint16_t);
82         }
83         int size = stride * 6;
84         if (size % 1024 != 0) {
85                 size &= ~1023;
86                 size += 1024;
87         }
88         return size;
89 }
90
91 void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
92 {
93         assert(width >= MIN_WIDTH);
94         size_t size = find_xfer_size_for_width(pixel_format, width);
95         int num_iso_pack = xfr->length / size;
96         if (num_iso_pack != xfr->num_iso_packets ||
97             size != xfr->iso_packet_desc[0].length) {
98                 xfr->num_iso_packets = num_iso_pack;
99                 libusb_set_iso_packet_lengths(xfr, size);
100         }
101 }
102
// One row of the video format lookup table in decode_video_format().
// The member order matters: the table is filled in with positional
// aggregate initialization.
struct VideoFormatEntry {
        // Format ID after the flag bits have been masked away.
        uint16_t normalized_video_format;

        // Picture dimensions, and the value copied into
        // VideoFormat::second_field_start (zero for progressive formats).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Lines above and below the picture that are not part of it.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as the fraction frame_rate_nom / frame_rate_den.
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
110
111 // Get details for the given video format; returns false if detection was incomplete.
112 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
113 {
114         decoded_video_format->id = video_format;
115         decoded_video_format->interlaced = false;
116
117         // TODO: Add these for all formats as we find them.
118         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
119
120         if (video_format == 0x0800) {
121                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
122                 // It's a strange thing, but what can you do.
123                 decoded_video_format->width = 720;
124                 decoded_video_format->height = 525;
125                 decoded_video_format->stride = 720 * 2;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 3013;
129                 decoded_video_format->frame_rate_den = 100;
130                 decoded_video_format->has_signal = false;
131                 return true;
132         }
133         if ((video_format & 0xe000) != 0xe000) {
134                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
135                         video_format);
136                 decoded_video_format->width = 0;
137                 decoded_video_format->height = 0;
138                 decoded_video_format->stride = 0;
139                 decoded_video_format->extra_lines_top = 0;
140                 decoded_video_format->extra_lines_bottom = 0;
141                 decoded_video_format->frame_rate_nom = 60;
142                 decoded_video_format->frame_rate_den = 1;
143                 decoded_video_format->has_signal = false;
144                 return false;
145         }
146
147         decoded_video_format->has_signal = true;
148
149         // NTSC (480i59.94, I suppose). A special case, see below.
150         if ((video_format & ~0x0800) == 0xe101 ||
151             (video_format & ~0x0800) == 0xe1c1 ||
152             (video_format & ~0x0800) == 0xe001) {
153                 decoded_video_format->width = 720;
154                 decoded_video_format->height = 480;
155                 if (video_format & 0x0800) {
156                         decoded_video_format->stride = 720 * 2;
157                 } else {
158                         decoded_video_format->stride = v210_stride(720);
159                 }
160                 decoded_video_format->extra_lines_top = 17;
161                 decoded_video_format->extra_lines_bottom = 28;
162                 decoded_video_format->frame_rate_nom = 30000;
163                 decoded_video_format->frame_rate_den = 1001;
164                 decoded_video_format->second_field_start = 280;
165                 decoded_video_format->interlaced = true;
166                 return true;
167         }
168
169         // PAL (576i50, I suppose). A special case, see below.
170         if ((video_format & ~0x0800) == 0xe109 ||
171             (video_format & ~0x0800) == 0xe1c9 ||
172             (video_format & ~0x0800) == 0xe009 ||
173             (video_format & ~0x0800) == 0xe3e9 ||
174             (video_format & ~0x0800) == 0xe3e1) {
175                 decoded_video_format->width = 720;
176                 decoded_video_format->height = 576;
177                 if (video_format & 0x0800) {
178                         decoded_video_format->stride = 720 * 2;
179                 } else {
180                         decoded_video_format->stride = v210_stride(720);
181                 }
182                 decoded_video_format->extra_lines_top = 22;
183                 decoded_video_format->extra_lines_bottom = 27;
184                 decoded_video_format->frame_rate_nom = 25;
185                 decoded_video_format->frame_rate_den = 1;
186                 decoded_video_format->second_field_start = 335;
187                 decoded_video_format->interlaced = true;
188                 return true;
189         }
190
191         // 0x8 seems to be a flag about availability of deep color on the input,
192         // except when it's not (e.g. it's the only difference between NTSC
193         // and PAL). Rather confusing. But we clear it here nevertheless, because
194         // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
195         // (as opposed to 10-bit).
196         //
197         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
198         uint16_t normalized_video_format = video_format & ~0xe80c;
199         constexpr VideoFormatEntry entries[] = {
200                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
201                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
202                 { 0x0151,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
203                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
204                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
205                 { 0x0161, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
206                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
207                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
208                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
209                 { 0x01c3, 1920, 1080,   0, 41,  4,    30,    1, false },  // 1080p30.
210                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
211                 { 0x01e1, 1920, 1080,   0, 41,  4, 30000, 1001, false },  // 1080p29.97.
212                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
213                 { 0x0063, 1920, 1080,   0, 41,  4,    25,    1, false },  // 1080p25.
214                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
215                 { 0x0083, 1920, 1080,   0, 41,  4,    24,    1, false },  // 1080p24.
216                 { 0x00a1, 1920, 1080,   0, 41,  4, 24000, 1001, false },  // 1080p23.98.
217         };
218         for (const VideoFormatEntry &entry : entries) {
219                 if (normalized_video_format == entry.normalized_video_format) {
220                         decoded_video_format->width = entry.width;
221                         decoded_video_format->height = entry.height;
222                         if (video_format & 0x0800) {
223                                 decoded_video_format->stride = entry.width * 2;
224                         } else {
225                                 decoded_video_format->stride = v210_stride(entry.width);
226                         }
227                         decoded_video_format->second_field_start = entry.second_field_start;
228                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
229                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
230                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
231                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
232                         decoded_video_format->interlaced = entry.interlaced;
233                         return true;
234                 }
235         }
236
237         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
238         decoded_video_format->width = 1280;
239         decoded_video_format->height = 720;
240         decoded_video_format->stride = 1280 * 2;
241         decoded_video_format->frame_rate_nom = 60;
242         decoded_video_format->frame_rate_den = 1;
243         return false;
244 }
245
246 // There are seemingly no direct indicators of sample rate; you just get
247 // one frame's worth and have to guess from that.
248 int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
249 {
250         size_t num_samples = len / 3 / 8;
251         size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
252
253         // See if we match or are very close to any of the mandatory HDMI sample rates.
254         const int candidate_sample_rates[] = { 32000, 44100, 48000 };
255         for (int rate : candidate_sample_rates) {
256                 if (abs(int(num_samples_per_second) - rate) <= 100) {
257                         return rate;
258                 }
259         }
260
261         fprintf(stderr, "%ld samples at %d/%d fps (%ld Hz) matches no known sample rate, keeping capture at %d Hz\n",
262                 num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
263         return default_rate;
264 }
265
266 }  // namespace
267
268 FrameAllocator::~FrameAllocator() {}
269
270 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
271         : frame_size(frame_size)
272 {
273         for (size_t i = 0; i < num_queued_frames; ++i) {
274                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
275         }
276 }
277
278 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
279 {
280         Frame vf;
281         vf.owner = this;
282
283         unique_lock<mutex> lock(freelist_mutex);  // Meh.
284         if (freelist.empty()) {
285                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
286                         frame_size);
287         } else {
288                 vf.data = freelist.top().release();
289                 vf.size = frame_size;
290                 freelist.pop();  // Meh.
291         }
292         return vf;
293 }
294
295 void MallocFrameAllocator::release_frame(Frame frame)
296 {
297         if (frame.overflow > 0) {
298                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
299         }
300         unique_lock<mutex> lock(freelist_mutex);
301         freelist.push(unique_ptr<uint8_t[]>(frame.data));
302 }
303
// Returns true if "a" comes before "b" on a 16-bit counter that wraps
// around, i.e. if stepping forward from a reaches b in at least one and
// fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, modulo 2^16.
        const uint16_t steps_forward = uint16_t(b - a);
        return steps_forward != 0 && steps_forward < 0x8000;
}
315
316 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
317 {
318         unique_lock<mutex> lock(queue_lock);
319         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
320                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
321                         q->back().timecode, timecode);
322                 frame.owner->release_frame(frame);
323                 return;
324         }
325
326         QueuedFrame qf;
327         qf.format = format;
328         qf.timecode = timecode;
329         qf.frame = frame;
330         q->push_back(move(qf));
331         queues_not_empty.notify_one();  // might be spurious
332 }
333
334 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
335 {
336         FILE *fp = fopen(filename, "wb");
337         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
338                 printf("short write!\n");
339         }
340         fclose(fp);
341 }
342
343 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
344 {
345         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
346 }
347
348 void BMUSBCapture::dequeue_thread_func()
349 {
350         char thread_name[16];
351         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
352         pthread_setname_np(pthread_self(), thread_name);
353
354         if (has_dequeue_callbacks) {
355                 dequeue_init_callback();
356         }
357         size_t last_sample_rate = 48000;
358         while (!dequeue_thread_should_quit) {
359                 unique_lock<mutex> lock(queue_lock);
360                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
361
362                 if (dequeue_thread_should_quit) break;
363
364                 uint16_t video_timecode = pending_video_frames.front().timecode;
365                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
366                 AudioFormat audio_format;
367                 audio_format.bits_per_sample = 24;
368                 audio_format.num_channels = 8;
369                 audio_format.sample_rate = last_sample_rate;
370                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
371                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
372                                 video_timecode);
373                         QueuedFrame video_frame = pending_video_frames.front();
374                         pending_video_frames.pop_front();
375                         lock.unlock();
376                         video_frame_allocator->release_frame(video_frame.frame);
377                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
378                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
379                                 audio_timecode);
380                         QueuedFrame audio_frame = pending_audio_frames.front();
381                         pending_audio_frames.pop_front();
382                         lock.unlock();
383                         audio_format.id = audio_frame.format;
384
385                         // Use the video format of the pending frame.
386                         QueuedFrame video_frame = pending_video_frames.front();
387                         VideoFormat video_format;
388                         decode_video_format(video_frame.format, &video_format);
389
390                         frame_callback(audio_timecode,
391                                        FrameAllocator::Frame(), 0, video_format,
392                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
393                 } else {
394                         QueuedFrame video_frame = pending_video_frames.front();
395                         QueuedFrame audio_frame = pending_audio_frames.front();
396                         pending_audio_frames.pop_front();
397                         pending_video_frames.pop_front();
398                         lock.unlock();
399
400 #if 0
401                         char filename[255];
402                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
403                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
404                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
405 #endif
406
407                         VideoFormat video_format;
408                         audio_format.id = audio_frame.format;
409                         if (decode_video_format(video_frame.format, &video_format)) {
410                                 if (audio_frame.frame.len != 0) {
411                                         audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
412                                         last_sample_rate = audio_format.sample_rate;
413                                 }
414                                 frame_callback(video_timecode,
415                                                video_frame.frame, HEADER_SIZE, video_format,
416                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
417                         } else {
418                                 video_frame_allocator->release_frame(video_frame.frame);
419                                 audio_format.sample_rate = last_sample_rate;
420                                 frame_callback(video_timecode,
421                                                FrameAllocator::Frame(), 0, video_format,
422                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
423                         }
424                 }
425         }
426         if (has_dequeue_callbacks) {
427                 dequeue_cleanup_callback();
428         }
429 }
430
431 void BMUSBCapture::start_new_frame(const uint8_t *start)
432 {
433         uint16_t format = (start[3] << 8) | start[2];
434         uint16_t timecode = (start[1] << 8) | start[0];
435
436         if (current_video_frame.len > 0) {
437                 current_video_frame.received_timestamp = steady_clock::now();
438
439                 // If format is 0x0800 (no signal), add a fake (empty) audio
440                 // frame to get it out of the queue.
441                 // TODO: Figure out if there are other formats that come with
442                 // no audio, and treat them the same.
443                 if (format == 0x0800) {
444                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
445                         if (fake_audio_frame.data == nullptr) {
446                                 // Oh well, it's just a no-signal frame anyway.
447                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
448                                 current_video_frame.owner->release_frame(current_video_frame);
449                                 current_video_frame = video_frame_allocator->alloc_frame();
450                                 return;
451                         }
452                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
453                 }
454                 //dump_frame();
455                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
456
457                 // Update the assumed frame width. We might be one frame too late on format changes,
458                 // but it's much better than asking the user to choose manually.
459                 VideoFormat video_format;
460                 if (decode_video_format(format, &video_format)) {
461                         assumed_frame_width = video_format.width;
462                 }
463         }
464         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
465         //      format, timecode,
466         //      //start[7], start[6], start[5], start[4],
467         //      read_current_frame, FRAME_SIZE);
468
469         current_video_frame = video_frame_allocator->alloc_frame();
470         //if (current_video_frame.data == nullptr) {
471         //      read_current_frame = -1;
472         //} else {
473         //      read_current_frame = 0;
474         //}
475 }
476
477 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
478 {
479         uint16_t format = (start[3] << 8) | start[2];
480         uint16_t timecode = (start[1] << 8) | start[0];
481         if (current_audio_frame.len > 0) {
482                 current_audio_frame.received_timestamp = steady_clock::now();
483                 //dump_audio_block();
484                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
485         }
486         //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
487         //      format, timecode);
488         current_audio_frame = audio_frame_allocator->alloc_frame();
489 }
490
#if 0
// Debugging aid (compiled out): prints the bytes actually received in one
// isochronous packet as hex, 16 per line with an extra gap after the
// eighth. "offset" is where the packet starts within xfr->buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        const unsigned num_bytes = pack->actual_length;
        //const unsigned num_bytes = min(pack->actual_length, 16u);
        for (unsigned j = 0; j < num_bytes; j++) {
                printf("%02x", xfr->buffer[j + offset]);

                const char *separator = " ";
                if ((j % 16) == 15) {
                        separator = "\n";
                } else if ((j % 8) == 7) {
                        separator = "  ";
                }
                fputs(separator, stdout);
        }
}
#endif
507
// Splits the "n" bytes at "src" into two streams: bytes at even offsets
// go to "dest1" and bytes at odd offsets go to "dest2". "n" must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        size_t pair = 0;
        for (size_t consumed = 0; consumed < n; consumed += 2) {
                dest1[pair] = src[consumed];
                dest2[pair] = src[consumed + 1];
                ++pair;
        }
}
519
520 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
521 {
522         if (current_frame->data == nullptr ||
523             current_frame->len > current_frame->size ||
524             start == end) {
525                 return;
526         }
527
528         int bytes = end - start;
529         if (current_frame->len + bytes > current_frame->size) {
530                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
531                 current_frame->len = current_frame->size;
532                 if (current_frame->overflow > 1048576) {
533                         printf("%d bytes overflow after last %s frame\n",
534                                 int(current_frame->overflow), frame_type_name);
535                         current_frame->overflow = 0;
536                 }
537                 //dump_frame();
538         } else {
539                 if (current_frame->data_copy != nullptr) {
540                         memcpy(current_frame->data_copy + current_frame->len, start, bytes);
541                 }
542                 if (current_frame->interleaved) {
543                         uint8_t *data = current_frame->data + current_frame->len / 2;
544                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
545                         if (current_frame->len % 2 == 1) {
546                                 ++data;
547                                 swap(data, data2);
548                         }
549                         if (bytes % 2 == 1) {
550                                 *data++ = *start++;
551                                 swap(data, data2);
552                                 ++current_frame->len;
553                                 --bytes;
554                         }
555                         memcpy_interleaved(data, data2, start, bytes);
556                         current_frame->len += bytes;
557                 } else {
558                         memcpy(current_frame->data + current_frame->len, start, bytes);
559                         current_frame->len += bytes;
560                 }
561         }
562 }
563
#if 0
// Debugging aid (compiled out): prints "name" followed by the 32 bytes of
// the vector "n" in hex, lowest byte first, with an extra space between
// each group of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i > 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
606
607 #ifndef HAS_MULTIVERSIONING
608
609 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
610 {
611         // No fast path possible unless we have multiversioning.
612         return start;
613 }
614
615 #else  // defined(HAS_MULTIVERSIONING)
616
// Forward declarations of the inner loop of the fast path, one version per
// instruction set (SSE4.1 and AVX2). They are declared here because
// add_to_frame_fastpath() below calls add_to_frame_fastpath_core() before
// the definitions appear.
__attribute__((target("sse4.1")))
const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);

__attribute__((target("avx2")))
const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
622
623 // Does a memcpy and memchr in one to reduce processing time.
624 // Note that the benefit is somewhat limited if your L3 cache is small,
625 // as you'll (unfortunately) spend most of the time loading the data
626 // from main memory.
627 //
628 // Complicated cases are left to the slow path; it basically stops copying
629 // up until the first instance of "sync_char" (usually a bit before, actually).
630 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
631 // data, and what we really need this for is the 00 00 ff ff marker in video data.
632 __attribute__((target("default")))
633 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
634 {
635         // No fast path possible unless we have SSE 4.1 or higher.
636         return start;
637 }
638
639 __attribute__((target("sse4.1", "avx2")))
640 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
641 {
642         if (current_frame->data == nullptr ||
643             current_frame->len > current_frame->size ||
644             start == limit) {
645                 return start;
646         }
647         size_t orig_bytes = limit - start;
648         if (orig_bytes < 128) {
649                 // Don't bother.
650                 return start;
651         }
652
653         // Don't read more bytes than we can write.
654         limit = min(limit, start + (current_frame->size - current_frame->len));
655
656         // Align end to 32 bytes.
657         limit = (const uint8_t *)(intptr_t(limit) & ~31);
658
659         if (start >= limit) {
660                 return start;
661         }
662
663         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
664         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
665         if (aligned_start != start) {
666                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
667                 if (sync_start == nullptr) {
668                         add_to_frame(current_frame, "", start, aligned_start);
669                 } else {
670                         add_to_frame(current_frame, "", start, sync_start);
671                         return sync_start;
672                 }
673         }
674
675         // Make the length a multiple of 64.
676         if (current_frame->interleaved) {
677                 if (((limit - aligned_start) % 64) != 0) {
678                         limit -= 32;
679                 }
680                 assert(((limit - aligned_start) % 64) == 0);
681         }
682
683         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
684 }
685
686 __attribute__((target("avx2")))
687 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
688 {
689         const __m256i needle = _mm256_set1_epi8(sync_char);
690
691         size_t bytes_copied;
692         const __restrict __m256i *in = (const __m256i *)aligned_start;
693         if (current_frame->interleaved) {
694                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
695                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
696                 if (current_frame->len % 2 == 1) {
697                         swap(out1, out2);
698                 }
699
700                 __m256i shuffle_cw = _mm256_set_epi8(
701                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
702                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
703                 while (in < (const __m256i *)limit) {
704                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
705                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
706                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
707
708                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
709                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
710                         __m256i found = _mm256_or_si256(found1, found2);
711
712                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
713                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
714                 
715                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
716                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
717
718                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
719                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
720
721                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
722                         _mm256_storeu_si256(out2, hi);
723
724                         if (!_mm256_testz_si256(found, found)) {
725                                 break;
726                         }
727
728                         in += 2;
729                         ++out1;
730                         ++out2;
731                 }
732                 bytes_copied = (uint8_t *)in - aligned_start;
733         } else {
734                 uint8_t *old_end = current_frame->data + current_frame->len;
735                 __m256i *out = (__m256i *)old_end;
736                 while (in < (const __m256i *)limit) {
737                         __m256i data = _mm256_load_si256(in);
738                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
739                         __m256i found = _mm256_cmpeq_epi8(data, needle);
740                         if (!_mm256_testz_si256(found, found)) {
741                                 break;
742                         }
743
744                         ++in;
745                         ++out;
746                 }
747                 bytes_copied = (uint8_t *)out - old_end;
748         }
749         if (current_frame->data_copy != nullptr) {
750                 // TODO: It would be somewhat more cache-efficient to write this in the
751                 // same loop as above. However, it might not be worth the extra complexity.
752                 memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
753         }
754         current_frame->len += bytes_copied;
755
756         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
757         return (const uint8_t *)in;
758 }
759
760 __attribute__((target("sse4.1")))
761 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
762 {
763         const __m128i needle = _mm_set1_epi8(sync_char);
764
765         const __m128i *in = (const __m128i *)aligned_start;
766         size_t bytes_copied;
767         if (current_frame->interleaved) {
768                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
769                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
770                 if (current_frame->len % 2 == 1) {
771                         swap(out1, out2);
772                 }
773
774                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
775                 while (in < (const __m128i *)limit) {
776                         __m128i data1 = _mm_load_si128(in);
777                         __m128i data2 = _mm_load_si128(in + 1);
778                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
779                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
780                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
781                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
782                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
783                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
784                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
785                         _mm_storeu_si128(out2, hi);
786                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
787                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
788                         if (!_mm_testz_si128(found1, found1) ||
789                             !_mm_testz_si128(found2, found2)) {
790                                 break;
791                         }
792
793                         in += 2;
794                         ++out1;
795                         ++out2;
796                 }
797                 bytes_copied = (uint8_t *)in - aligned_start;
798         } else {
799                 uint8_t *old_end = current_frame->data + current_frame->len;
800                 __m128i *out = (__m128i *)old_end;
801                 while (in < (const __m128i *)limit) {
802                         __m128i data = _mm_load_si128(in);
803                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
804                         __m128i found = _mm_cmpeq_epi8(data, needle);
805                         if (!_mm_testz_si128(found, found)) {
806                                 break;
807                         }
808
809                         ++in;
810                         ++out;
811                 }
812                 bytes_copied = (uint8_t *)out - old_end;
813         }
814         if (current_frame->data_copy != nullptr) {
815                 // TODO: It would be somewhat more cache-efficient to write this in the
816                 // same loop as above. However, it might not be worth the extra complexity.
817                 memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
818         }
819         current_frame->len += bytes_copied;
820
821         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
822         return (const uint8_t *)in;
823 }
824
825 #endif  // defined(HAS_MULTIVERSIONING)
826
827 void decode_packs(const libusb_transfer *xfr,
828                   const char *sync_pattern,
829                   int sync_length,
830                   FrameAllocator::Frame *current_frame,
831                   const char *frame_type_name,
832                   function<void(const uint8_t *start)> start_callback)
833 {
834         int offset = 0;
835         for (int i = 0; i < xfr->num_iso_packets; i++) {
836                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
837
838                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
839                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
840                         continue;
841 //exit(5);
842                 }
843
844                 const uint8_t *start = xfr->buffer + offset;
845                 const uint8_t *limit = start + pack->actual_length;
846                 while (start < limit) {  // Usually runs only one iteration.
847                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
848                         if (start == limit) break;
849                         assert(start < limit);
850
851                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
852                         if (start_next_frame == nullptr) {
853                                 // add the rest of the buffer
854                                 add_to_frame(current_frame, frame_type_name, start, limit);
855                                 break;
856                         } else {
857                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
858                                 start = start_next_frame + sync_length;  // skip sync
859                                 start_callback(start);
860                         }
861                 }
862 #if 0
863                 dump_pack(xfr, offset, pack);
864 #endif
865                 offset += pack->length;
866         }
867 }
868
869 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
870 {
871         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
872             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
873                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
874                 libusb_free_transfer(xfr);
875                 exit(3);
876         }
877
878         assert(xfr->user_data != nullptr);
879         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
880
881         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
882                 if (!usb->disconnected) {
883                         fprintf(stderr, "Device went away, stopping transfers.\n");
884                         usb->disconnected = true;
885                         if (usb->card_disconnected_callback) {
886                                 usb->card_disconnected_callback();
887                         }
888                 }
889                 // Don't reschedule the transfer; the loop will stop by itself.
890                 return;
891         }
892
893         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
894                 if (xfr->endpoint == 0x84) {
895                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
896                 } else {
897                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
898
899                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
900                         change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
901                 }
902         }
903         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
904                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
905                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
906 #if 0
907                 if (setup->wIndex == 44) {
908                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
909                 } else {
910                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
911                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
912                 }
913 #else
914                 memcpy(usb->register_file + usb->current_register, buf, 4);
915                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
916                 if (usb->current_register == 0) {
917                         // read through all of them
918                         printf("register dump:");
919                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
920                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
921                         }
922                         printf("\n");
923                 }
924                 libusb_fill_control_setup(xfr->buffer,
925                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
926                         /*index=*/usb->current_register, /*length=*/4);
927 #endif
928         }
929
930 #if 0
931         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
932         for (i = 0; i < xfr->actual_length; i++) {
933                 printf("%02x", xfr->buffer[i]);
934                 if (i % 16)
935                         printf("\n");
936                 else if (i % 8)
937                         printf("  ");
938                 else
939                         printf(" ");
940         }
941 #endif
942
943         int rc = libusb_submit_transfer(xfr);
944         if (rc < 0) {
945                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
946                 exit(1);
947         }
948 }
949
950 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
951 {
952         if (card_connected_callback != nullptr) {
953                 libusb_device_descriptor desc;
954                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
955                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
956                         libusb_unref_device(dev);
957                         return 1;
958                 }
959
960                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
961                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
962                         card_connected_callback(dev);  // Callback takes ownership.
963                         return 0;
964                 }
965         }
966         libusb_unref_device(dev);
967         return 0;
968 }
969
970 void BMUSBCapture::usb_thread_func()
971 {
972         sched_param param;
973         memset(&param, 0, sizeof(param));
974         param.sched_priority = 1;
975         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
976                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
977         }
978         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
979         while (!should_quit) {
980                 timeval sec { 1, 0 };
981                 int rc = libusb_handle_events_timeout(nullptr, &sec);
982                 if (rc != LIBUSB_SUCCESS)
983                         break;
984         }
985 }
986
987 namespace {
988
989 struct USBCardDevice {
990         uint16_t product;
991         uint8_t bus, port;
992         libusb_device *device;
993 };
994
// Maps a supported USB product ID to a human-readable product name.
// Any other ID is a programming error: it asserts, and returns nullptr
// if assertions are compiled out.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
1006
1007 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
1008 {
1009         const char *product_name = get_product_name(product);
1010
1011         char buf[256];
1012         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
1013                 id, bus, port, product_name);
1014         return buf;
1015 }
1016
1017 vector<USBCardDevice> find_all_cards()
1018 {
1019         libusb_device **devices;
1020         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
1021         if (num_devices == -1) {
1022                 fprintf(stderr, "Error finding USB devices\n");
1023                 exit(1);
1024         }
1025         vector<USBCardDevice> found_cards;
1026         for (ssize_t i = 0; i < num_devices; ++i) {
1027                 libusb_device_descriptor desc;
1028                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
1029                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
1030                         exit(1);
1031                 }
1032
1033                 uint8_t bus = libusb_get_bus_number(devices[i]);
1034                 uint8_t port = libusb_get_port_number(devices[i]);
1035
1036                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
1037                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
1038                         libusb_unref_device(devices[i]);
1039                         continue;
1040                 }
1041
1042                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
1043         }
1044         libusb_free_device_list(devices, 0);
1045
1046         // Sort the devices to get a consistent ordering.
1047         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
1048                 if (a.product != b.product)
1049                         return a.product < b.product;
1050                 if (a.bus != b.bus)
1051                         return a.bus < b.bus;
1052                 return a.port < b.port;
1053         });
1054
1055         return found_cards;
1056 }
1057
1058 libusb_device_handle *open_card(int card_index, string *description)
1059 {
1060         vector<USBCardDevice> found_cards = find_all_cards();
1061
1062         for (size_t i = 0; i < found_cards.size(); ++i) {
1063                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
1064                 fprintf(stderr, "%s\n", tmp_description.c_str());
1065                 if (i == size_t(card_index)) {
1066                         *description = tmp_description;
1067                 }
1068         }
1069
1070         if (size_t(card_index) >= found_cards.size()) {
1071                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
1072                 exit(1);
1073         }
1074
1075         libusb_device_handle *devh;
1076         int rc = libusb_open(found_cards[card_index].device, &devh);
1077         if (rc < 0) {
1078                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1079                 exit(1);
1080         }
1081
1082         for (size_t i = 0; i < found_cards.size(); ++i) {
1083                 libusb_unref_device(found_cards[i].device);
1084         }
1085
1086         return devh;
1087 }
1088
1089 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1090 {
1091         uint8_t bus = libusb_get_bus_number(dev);
1092         uint8_t port = libusb_get_port_number(dev);
1093
1094         libusb_device_descriptor desc;
1095         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1096                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1097                 exit(1);
1098         }
1099
1100         *description = get_card_description(card_index, bus, port, desc.idProduct);
1101
1102         libusb_device_handle *devh;
1103         int rc = libusb_open(dev, &devh);
1104         if (rc < 0) {
1105                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1106                 exit(1);
1107         }
1108
1109         return devh;
1110 }
1111
1112 }  // namespace
1113
1114 unsigned BMUSBCapture::num_cards()
1115 {
1116         int rc = libusb_init(nullptr);
1117         if (rc < 0) {
1118                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1119                 exit(1);
1120         }
1121
1122         vector<USBCardDevice> found_cards = find_all_cards();
1123         unsigned ret = found_cards.size();
1124         for (size_t i = 0; i < found_cards.size(); ++i) {
1125                 libusb_unref_device(found_cards[i].device);
1126         }
1127         return ret;
1128 }
1129
1130 void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
1131 {
1132         current_pixel_format = pixel_format;
1133         update_capture_mode();
1134 }
1135
1136 void BMUSBCapture::configure_card()
1137 {
1138         if (video_frame_allocator == nullptr) {
1139                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1140                 set_video_frame_allocator(owned_video_frame_allocator.get());
1141         }
1142         if (audio_frame_allocator == nullptr) {
1143                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1144                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1145         }
1146         dequeue_thread_should_quit = false;
1147         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1148
1149         int rc;
1150         struct libusb_transfer *xfr;
1151
1152         rc = libusb_init(nullptr);
1153         if (rc < 0) {
1154                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1155                 exit(1);
1156         }
1157
1158         if (dev == nullptr) {
1159                 devh = open_card(card_index, &description);
1160         } else {
1161                 devh = open_card(card_index, dev, &description);
1162                 libusb_unref_device(dev);
1163         }
1164         if (!devh) {
1165                 fprintf(stderr, "Error finding USB device\n");
1166                 exit(1);
1167         }
1168
1169         libusb_config_descriptor *config;
1170         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1171         if (rc < 0) {
1172                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1173                 exit(1);
1174         }
1175
1176 #if 0
1177         printf("%d interface\n", config->bNumInterfaces);
1178         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1179                 printf("  interface %d\n", interface_number);
1180                 const libusb_interface *interface = &config->interface[interface_number];
1181                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1182                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1183                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1184                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1185                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1186                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1187                         }
1188                 }
1189         }
1190 #endif
1191
1192         rc = libusb_set_configuration(devh, /*configuration=*/1);
1193         if (rc < 0) {
1194                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1195                 exit(1);
1196         }
1197
1198         rc = libusb_claim_interface(devh, 0);
1199         if (rc < 0) {
1200                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1201                 exit(1);
1202         }
1203
1204         // Alternate setting 1 is output, alternate setting 2 is input.
1205         // Card is reset when switching alternates, so the driver uses
1206         // this “double switch” when it wants to reset.
1207         //
1208         // There's also alternate settings 3 and 4, which seem to be
1209         // like 1 and 2 except they advertise less bandwidth needed.
1210         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1211         if (rc < 0) {
1212                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1213                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1214                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1215                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1216                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1217                 }
1218                 exit(1);
1219         }
1220         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1221         if (rc < 0) {
1222                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1223                 exit(1);
1224         }
1225 #if 0
1226         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1227         if (rc < 0) {
1228                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1229                 exit(1);
1230         }
1231 #endif
1232
1233 #if 0
1234         rc = libusb_claim_interface(devh, 3);
1235         if (rc < 0) {
1236                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1237                 exit(1);
1238         }
1239 #endif
1240
1241         // theories:
1242         //   44 is some kind of timer register (first 16 bits count upwards)
1243         //   24 is some sort of watchdog?
1244         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1245         //      (or will go to 0x73c60010?), also seen 0x73c60100
1246         //   12 also changes all the time, unclear why  
1247         //   16 seems to be autodetected mode somehow
1248         //      --    this is e00115e0 after reset?
1249         //                    ed0115e0 after mode change [to output?]
1250         //                    2d0015e0 after more mode change [to input]
1251         //                    ed0115e0 after more mode change
1252         //                    2d0015e0 after more mode change
1253         //
1254         //                    390115e0 seems to indicate we have signal
1255         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1256         //
1257         //                    200015e0 on startup
1258         //         changes to 250115e0 when we sync to the signal
1259         //
1260         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1261         //
1262         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1263         //
1264         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1265         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1266         //
1267         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1268         //    perhaps some of them are related to analog output?
1269         //
1270         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1271         //    but the driver sets it to 0x8036802a at some point.
1272         //
1273         //    all of this is on request 214/215. other requests (192, 219,
1274         //    222, 223, 224) are used for firmware upgrade. Probably best to
1275         //    stay out of it unless you know what you're doing.
1276         //
1277         //
1278         // register 16:
1279         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1280         //
1281         // theories:
1282         //   0x01 - stable signal
1283         //   0x04 - deep color
1284         //   0x08 - unknown (audio??)
1285         //   0x20 - 720p??
1286         //   0x30 - 576p??
1287
1288         update_capture_mode();
1289
1290         struct ctrl {
1291                 int endpoint;
1292                 int request;
1293                 int index;
1294                 uint32_t data;
1295         };
1296         static const ctrl ctrls[] = {
1297                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1298                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1299
1300                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1301                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1302                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1303                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1304         };
1305
1306         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1307                 uint32_t flipped = htonl(ctrls[req].data);
1308                 static uint8_t value[4];
1309                 memcpy(value, &flipped, sizeof(flipped));
1310                 int size = sizeof(value);
1311                 //if (ctrls[req].request == 215) size = 0;
1312                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1313                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1314                 if (rc < 0) {
1315                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1316                         exit(1);
1317                 }
1318
1319                 if (ctrls[req].index == 16 && rc == 4) {
1320                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1321                 }
1322
1323 #if 0
1324                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1325                 for (int i = 0; i < rc; ++i) {
1326                         printf("%02x", value[i]);
1327                 }
1328                 printf("\n");
1329 #endif
1330         }
1331
1332 #if 0
1333         // DEBUG
1334         for ( ;; ) {
1335                 static int my_index = 0;
1336                 static uint8_t value[4];
1337                 int size = sizeof(value);
1338                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1339                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1340                 if (rc < 0) {
1341                         fprintf(stderr, "Error on control\n");
1342                         exit(1);
1343                 }
1344                 printf("rc=%d index=%d: 0x", rc, my_index);
1345                 for (int i = 0; i < rc; ++i) {
1346                         printf("%02x", value[i]);
1347                 }
1348                 printf("\n");
1349         }
1350 #endif
1351
1352 #if 0
1353         // set up an asynchronous transfer of the timer register
1354         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1355         static int completed = 0;
1356
1357         xfr = libusb_alloc_transfer(0);
1358         libusb_fill_control_setup(cmdbuf,
1359             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1360                 /*index=*/44, /*length=*/4);
1361         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1362         xfr->user_data = this;
1363         libusb_submit_transfer(xfr);
1364
1365         // set up an asynchronous transfer of register 24
1366         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1367         static int completed2 = 0;
1368
1369         xfr = libusb_alloc_transfer(0);
1370         libusb_fill_control_setup(cmdbuf2,
1371             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1372                 /*index=*/24, /*length=*/4);
1373         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1374         xfr->user_data = this;
1375         libusb_submit_transfer(xfr);
1376 #endif
1377
1378         // set up an asynchronous transfer of the register dump
1379         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1380         static int completed3 = 0;
1381
1382         xfr = libusb_alloc_transfer(0);
1383         libusb_fill_control_setup(cmdbuf3,
1384             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1385                 /*index=*/current_register, /*length=*/4);
1386         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1387         xfr->user_data = this;
1388         //libusb_submit_transfer(xfr);
1389
1390         //audiofp = fopen("audio.raw", "wb");
1391
1392         // set up isochronous transfers for audio and video
1393         for (int e = 3; e <= 4; ++e) {
1394                 int num_transfers = 6;
1395                 for (int i = 0; i < num_transfers; ++i) {
1396                         size_t buf_size;
1397                         int num_iso_pack, size;
1398                         if (e == 3) {
1399                                 // Allocate for minimum width (because that will give us the most
1400                                 // number of packets, so we don't need to reallocate, but we'll
1401                                 // default to 720p for the first frame.
1402                                 size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
1403                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1404                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1405                         } else {
1406                                 size = 0xc0;
1407                                 num_iso_pack = 80;
1408                                 buf_size = num_iso_pack * size;
1409                         }
1410                         int num_bytes = num_iso_pack * size;
1411                         assert(size_t(num_bytes) <= buf_size);
1412 #if LIBUSB_API_VERSION >= 0x01000105
1413                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1414 #else
1415                         uint8_t *buf = nullptr;
1416 #endif
1417                         if (buf == nullptr) {
1418                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1419 #if LIBUSB_API_VERSION >= 0x01000105
1420                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1421 #else
1422                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1423 #endif
1424                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1425                                 buf = new uint8_t[num_bytes];
1426                         }
1427
1428                         xfr = libusb_alloc_transfer(num_iso_pack);
1429                         if (!xfr) {
1430                                 fprintf(stderr, "oom\n");
1431                                 exit(1);
1432                         }
1433
1434                         int ep = LIBUSB_ENDPOINT_IN | e;
1435                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1436                                 num_iso_pack, cb_xfr, nullptr, 0);
1437                         libusb_set_iso_packet_lengths(xfr, size);
1438                         xfr->user_data = this;
1439
1440                         if (e == 3) {
1441                                 change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
1442                         }
1443
1444                         iso_xfrs.push_back(xfr);
1445                 }
1446         }
1447 }
1448
1449 void BMUSBCapture::start_bm_capture()
1450 {
1451         int i = 0;
1452         for (libusb_transfer *xfr : iso_xfrs) {
1453                 int rc = libusb_submit_transfer(xfr);
1454                 ++i;
1455                 if (rc < 0) {
1456                         //printf("num_bytes=%d\n", num_bytes);
1457                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1458                                 xfr->endpoint, i, libusb_error_name(rc));
1459                         exit(1);
1460                 }
1461         }
1462
1463
1464 #if 0
1465         libusb_release_interface(devh, 0);
1466 out:
1467         if (devh)
1468                 libusb_close(devh);
1469         libusb_exit(nullptr);
1470         return rc;
1471 #endif
1472 }
1473
1474 void BMUSBCapture::stop_dequeue_thread()
1475 {
1476         dequeue_thread_should_quit = true;
1477         queues_not_empty.notify_all();
1478         dequeue_thread.join();
1479 }
1480
1481 void BMUSBCapture::start_bm_thread()
1482 {
1483         // Devices leaving are discovered by seeing the isochronous packets
1484         // coming back with errors, so only care about devices joining.
1485         if (card_connected_callback != nullptr) {
1486                 if (libusb_hotplug_register_callback(
1487                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1488                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1489                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1490                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1491                         exit(1);
1492                 }
1493         }
1494
1495         should_quit = false;
1496         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1497 }
1498
1499 void BMUSBCapture::stop_bm_thread()
1500 {
1501         should_quit = true;
1502         libusb_interrupt_event_handler(nullptr);
1503         usb_thread.join();
1504 }
1505
1506 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1507 {
1508         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1509         VideoMode auto_mode;
1510         auto_mode.name = "Autodetect";
1511         auto_mode.autodetect = true;
1512         return {{ 0, auto_mode }};
1513 }
1514
1515 uint32_t BMUSBCapture::get_current_video_mode() const
1516 {
1517         return 0;  // Matches get_available_video_modes().
1518 }
1519
1520 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1521 {
1522         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1523 }
1524
1525 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1526 {
1527         return {
1528                 { 0x00000000, "HDMI/SDI" },
1529                 { 0x02000000, "Component" },
1530                 { 0x04000000, "Composite" },
1531                 { 0x06000000, "S-video" }
1532         };
1533 }
1534
1535 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1536 {
1537         assert((video_input_id & ~0x06000000) == 0);
1538         current_video_input = video_input_id;
1539         update_capture_mode();
1540 }
1541
1542 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1543 {
1544         return {
1545                 { 0x00000000, "Embedded" },
1546                 { 0x10000000, "Analog" }
1547         };
1548 }
1549
1550 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1551 {
1552         assert((audio_input_id & ~0x10000000) == 0);
1553         current_audio_input = audio_input_id;
1554         update_capture_mode();
1555 }
1556
1557 void BMUSBCapture::update_capture_mode()
1558 {
1559         if (devh == nullptr) {
1560                 return;
1561         }
1562
1563         // Clearing the 0x08000000 bit seems to change the capture format (other source?).
1564         uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
1565         if (current_pixel_format == PixelFormat_8BitYCbCr) {
1566                 mode |= htonl(0x20000000);
1567         } else {
1568                 assert(current_pixel_format == PixelFormat_10BitYCbCr);
1569         }
1570
1571         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1572                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1573         if (rc < 0) {
1574                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1575                 exit(1);
1576         }
1577 }
1578
1579 }  // namespace bmusb