]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Add a stride field to VideoFormat.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.5.4
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
// Out-of-class definitions of BMUSBCapture's static members. Both start out
// in their "unset" state: no card-connected callback is installed, and
// hotplug_existing_devices is off.
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
// Stream that dump_audio_block() writes raw audio to. Nothing in the part of
// the file visible here opens it.
61 FILE *audiofp;
62
// NOTE(review): neither of these is referenced in the part of the file visible
// here; presumably the libusb event-handling thread and its shutdown flag --
// confirm against the code that starts/stops it.
63 thread usb_thread;
64 atomic<bool> should_quit;
65
// Returns the isochronous packet size, in bytes, to use for video of the
// given width.
//
// Video seems to require isochronous packets scaled with the width;
// seemingly six lines (at width * 2 bytes per line) is about right,
// rounded up to the required 1 kB multiple.
int find_xfer_size_for_width(int width)
{
        // Note that for 10-bit input, you'll need to increase this accordingly
        // (i.e., multiply by 4/3).
        const int six_lines = width * 2 * 6;

        // Round up to the next multiple of 1024; exact multiples are left alone.
        return (six_lines + 1023) & ~1023;
}
80
81 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
82 {
83         assert(width >= MIN_WIDTH);
84         size_t size = find_xfer_size_for_width(width);
85         int num_iso_pack = xfr->length / size;
86         if (num_iso_pack != xfr->num_iso_packets ||
87             size != xfr->iso_packet_desc[0].length) {
88                 xfr->num_iso_packets = num_iso_pack;
89                 libusb_set_iso_packet_lengths(xfr, size);
90         }
91 }
92
// One row of the lookup table in decode_video_format(). The field order
// matters, since the table is written with positional aggregate initializers.
struct VideoFormatEntry {
        // Format word after masking (video_format & ~0xe80c); the lookup key.
        uint16_t normalized_video_format;

        // Copied verbatim into the corresponding VideoFormat fields on a match.
        unsigned width;
        unsigned height;
        unsigned second_field_start;
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction (e.g. 60000/1001).
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
100
101 // Get details for the given video format; returns false if detection was incomplete.
102 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
103 {
104         decoded_video_format->id = video_format;
105         decoded_video_format->interlaced = false;
106
107         // TODO: Add these for all formats as we find them.
108         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
109
110         if (video_format == 0x0800) {
111                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
112                 // It's a strange thing, but what can you do.
113                 decoded_video_format->width = 720;
114                 decoded_video_format->height = 525;
115                 decoded_video_format->stride = 720 * 2;
116                 decoded_video_format->extra_lines_top = 0;
117                 decoded_video_format->extra_lines_bottom = 0;
118                 decoded_video_format->frame_rate_nom = 3013;
119                 decoded_video_format->frame_rate_den = 100;
120                 decoded_video_format->has_signal = false;
121                 return true;
122         }
123         if ((video_format & 0xe800) != 0xe800) {
124                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
125                         video_format);
126                 decoded_video_format->width = 0;
127                 decoded_video_format->height = 0;
128                 decoded_video_format->stride = 0;
129                 decoded_video_format->extra_lines_top = 0;
130                 decoded_video_format->extra_lines_bottom = 0;
131                 decoded_video_format->frame_rate_nom = 60;
132                 decoded_video_format->frame_rate_den = 1;
133                 decoded_video_format->has_signal = false;
134                 return false;
135         }
136
137         decoded_video_format->has_signal = true;
138
139         // NTSC (480i59.94, I suppose). A special case, see below.
140         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
141                 decoded_video_format->width = 720;
142                 decoded_video_format->height = 480;
143                 decoded_video_format->stride = 720 * 2;
144                 decoded_video_format->extra_lines_top = 17;
145                 decoded_video_format->extra_lines_bottom = 28;
146                 decoded_video_format->frame_rate_nom = 30000;
147                 decoded_video_format->frame_rate_den = 1001;
148                 decoded_video_format->second_field_start = 280;
149                 decoded_video_format->interlaced = true;
150                 return true;
151         }
152
153         // PAL (576i50, I suppose). A special case, see below.
154         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
155                 decoded_video_format->width = 720;
156                 decoded_video_format->height = 576;
157                 decoded_video_format->stride = 720 * 2;
158                 decoded_video_format->extra_lines_top = 22;
159                 decoded_video_format->extra_lines_bottom = 27;
160                 decoded_video_format->frame_rate_nom = 25;
161                 decoded_video_format->frame_rate_den = 1;
162                 decoded_video_format->second_field_start = 335;
163                 decoded_video_format->interlaced = true;
164                 return true;
165         }
166
167         // 0x8 seems to be a flag about availability of deep color on the input,
168         // except when it's not (e.g. it's the only difference between NTSC
169         // and PAL). Rather confusing. But we clear it here nevertheless, because
170         // usually it doesn't mean anything.
171         //
172         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
173         uint16_t normalized_video_format = video_format & ~0xe80c;
174         constexpr VideoFormatEntry entries[] = {
175                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
176                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
177                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
178                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
179                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
180                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
181                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
182                 { 0x01c3, 1920, 1080,   0, 20, 25,    30,    1, false },  // 1080p30.
183                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
184                 { 0x01e1, 1920, 1080,   0, 20, 25, 30000, 1001, false },  // 1080p29.97.
185                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
186                 { 0x0063, 1920, 1080,   0, 20, 25,    25,    1, false },  // 1080p25.
187                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
188                 { 0x0083, 1920, 1080,   0, 20, 25,    24,    1, false },  // 1080p24.
189                 { 0x00a1, 1920, 1080,   0, 20, 25, 24000, 1001, false },  // 1080p23.98.
190         };
191         for (const VideoFormatEntry &entry : entries) {
192                 if (normalized_video_format == entry.normalized_video_format) {
193                         decoded_video_format->width = entry.width;
194                         decoded_video_format->height = entry.height;
195                         decoded_video_format->stride = entry.width * 2;
196                         decoded_video_format->second_field_start = entry.second_field_start;
197                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
198                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
199                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
200                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
201                         decoded_video_format->interlaced = entry.interlaced;
202                         return true;
203                 }
204         }
205
206         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
207         decoded_video_format->width = 1280;
208         decoded_video_format->height = 720;
209         decoded_video_format->stride = 1280 * 2;
210         decoded_video_format->frame_rate_nom = 60;
211         decoded_video_format->frame_rate_den = 1;
212         return false;
213 }
214
215 }  // namespace
216
217 FrameAllocator::~FrameAllocator() {}
218
219 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
220         : frame_size(frame_size)
221 {
222         for (size_t i = 0; i < num_queued_frames; ++i) {
223                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
224         }
225 }
226
227 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
228 {
229         Frame vf;
230         vf.owner = this;
231
232         unique_lock<mutex> lock(freelist_mutex);  // Meh.
233         if (freelist.empty()) {
234                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
235                         frame_size);
236         } else {
237                 vf.data = freelist.top().release();
238                 vf.size = frame_size;
239                 freelist.pop();  // Meh.
240         }
241         return vf;
242 }
243
244 void MallocFrameAllocator::release_frame(Frame frame)
245 {
246         if (frame.overflow > 0) {
247                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
248         }
249         unique_lock<mutex> lock(freelist_mutex);
250         freelist.push(unique_ptr<uint8_t[]>(frame.data));
251 }
252
// Returns true if "a" comes strictly before "b" on a 16-bit counter that
// wraps around, i.e. if stepping forward from a reaches b in fewer than
// 0x8000 steps. Equal values are never "less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, modulo 65536.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
264
265 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
266 {
267         unique_lock<mutex> lock(queue_lock);
268         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
269                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
270                         q->back().timecode, timecode);
271                 frame.owner->release_frame(frame);
272                 return;
273         }
274
275         QueuedFrame qf;
276         qf.format = format;
277         qf.timecode = timecode;
278         qf.frame = frame;
279         q->push_back(move(qf));
280         queues_not_empty.notify_one();  // might be spurious
281 }
282
283 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
284 {
285         FILE *fp = fopen(filename, "wb");
286         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
287                 printf("short write!\n");
288         }
289         fclose(fp);
290 }
291
292 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
293 {
294         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
295 }
296
297 void BMUSBCapture::dequeue_thread_func()
298 {
299         char thread_name[16];
300         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
301         pthread_setname_np(pthread_self(), thread_name);
302
303         if (has_dequeue_callbacks) {
304                 dequeue_init_callback();
305         }
306         while (!dequeue_thread_should_quit) {
307                 unique_lock<mutex> lock(queue_lock);
308                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
309
310                 if (dequeue_thread_should_quit) break;
311
312                 uint16_t video_timecode = pending_video_frames.front().timecode;
313                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
314                 AudioFormat audio_format;
315                 audio_format.bits_per_sample = 24;
316                 audio_format.num_channels = 8;
317                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
318                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
319                                 video_timecode);
320                         QueuedFrame video_frame = pending_video_frames.front();
321                         pending_video_frames.pop_front();
322                         lock.unlock();
323                         video_frame_allocator->release_frame(video_frame.frame);
324                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
325                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
326                                 audio_timecode);
327                         QueuedFrame audio_frame = pending_audio_frames.front();
328                         pending_audio_frames.pop_front();
329                         lock.unlock();
330                         audio_format.id = audio_frame.format;
331
332                         // Use the video format of the pending frame.
333                         QueuedFrame video_frame = pending_video_frames.front();
334                         VideoFormat video_format;
335                         decode_video_format(video_frame.format, &video_format);
336
337                         frame_callback(audio_timecode,
338                                        FrameAllocator::Frame(), 0, video_format,
339                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
340                 } else {
341                         QueuedFrame video_frame = pending_video_frames.front();
342                         QueuedFrame audio_frame = pending_audio_frames.front();
343                         pending_audio_frames.pop_front();
344                         pending_video_frames.pop_front();
345                         lock.unlock();
346
347 #if 0
348                         char filename[255];
349                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
350                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
351                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
352 #endif
353
354                         VideoFormat video_format;
355                         audio_format.id = audio_frame.format;
356                         if (decode_video_format(video_frame.format, &video_format)) {
357                                 frame_callback(video_timecode,
358                                                video_frame.frame, HEADER_SIZE, video_format,
359                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
360                         } else {
361                                 frame_callback(video_timecode,
362                                                FrameAllocator::Frame(), 0, video_format,
363                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
364                         }
365                 }
366         }
367         if (has_dequeue_callbacks) {
368                 dequeue_cleanup_callback();
369         }
370 }
371
372 void BMUSBCapture::start_new_frame(const uint8_t *start)
373 {
374         uint16_t format = (start[3] << 8) | start[2];
375         uint16_t timecode = (start[1] << 8) | start[0];
376
377         if (current_video_frame.len > 0) {
378                 current_video_frame.received_timestamp = steady_clock::now();
379
380                 // If format is 0x0800 (no signal), add a fake (empty) audio
381                 // frame to get it out of the queue.
382                 // TODO: Figure out if there are other formats that come with
383                 // no audio, and treat them the same.
384                 if (format == 0x0800) {
385                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
386                         if (fake_audio_frame.data == nullptr) {
387                                 // Oh well, it's just a no-signal frame anyway.
388                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
389                                 current_video_frame.owner->release_frame(current_video_frame);
390                                 current_video_frame = video_frame_allocator->alloc_frame();
391                                 return;
392                         }
393                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
394                 }
395                 //dump_frame();
396                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
397
398                 // Update the assumed frame width. We might be one frame too late on format changes,
399                 // but it's much better than asking the user to choose manually.
400                 VideoFormat video_format;
401                 if (decode_video_format(format, &video_format)) {
402                         assumed_frame_width = video_format.width;
403                 }
404         }
405         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
406         //      format, timecode,
407         //      //start[7], start[6], start[5], start[4],
408         //      read_current_frame, FRAME_SIZE);
409
410         current_video_frame = video_frame_allocator->alloc_frame();
411         //if (current_video_frame.data == nullptr) {
412         //      read_current_frame = -1;
413         //} else {
414         //      read_current_frame = 0;
415         //}
416 }
417
418 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
419 {
420         uint16_t format = (start[3] << 8) | start[2];
421         uint16_t timecode = (start[1] << 8) | start[0];
422         if (current_audio_frame.len > 0) {
423                 current_audio_frame.received_timestamp = steady_clock::now();
424                 //dump_audio_block();
425                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
426         }
427         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
428         //      format, timecode, read_current_audio_block);
429         current_audio_frame = audio_frame_allocator->alloc_frame();
430 }
431
432 #if 0
433 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
434 {
435         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
436         for (unsigned j = 0; j < pack->actual_length; j++) {
437         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
438                 printf("%02x", xfr->buffer[j + offset]);
439                 if ((j % 16) == 15)
440                         printf("\n");
441                 else if ((j % 8) == 7)
442                         printf("  ");
443                 else
444                         printf(" ");
445         }
446 }
447 #endif
448
// De-interleaves n bytes from src: bytes at even offsets go to dest1, bytes
// at odd offsets go to dest2 (so each destination receives n/2 bytes).
// n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t consumed = 0; consumed < n; consumed += 2) {
                const size_t pair = consumed / 2;
                dest1[pair] = src[consumed];
                dest2[pair] = src[consumed + 1];
        }
}
460
461 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
462 {
463         if (current_frame->data == nullptr ||
464             current_frame->len > current_frame->size ||
465             start == end) {
466                 return;
467         }
468
469         int bytes = end - start;
470         if (current_frame->len + bytes > current_frame->size) {
471                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
472                 current_frame->len = current_frame->size;
473                 if (current_frame->overflow > 1048576) {
474                         printf("%d bytes overflow after last %s frame\n",
475                                 int(current_frame->overflow), frame_type_name);
476                         current_frame->overflow = 0;
477                 }
478                 //dump_frame();
479         } else {
480                 if (current_frame->interleaved) {
481                         uint8_t *data = current_frame->data + current_frame->len / 2;
482                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
483                         if (current_frame->len % 2 == 1) {
484                                 ++data;
485                                 swap(data, data2);
486                         }
487                         if (bytes % 2 == 1) {
488                                 *data++ = *start++;
489                                 swap(data, data2);
490                                 ++current_frame->len;
491                                 --bytes;
492                         }
493                         memcpy_interleaved(data, data2, start, bytes);
494                         current_frame->len += bytes;
495                 } else {
496                         memcpy(current_frame->data + current_frame->len, start, bytes);
497                         current_frame->len += bytes;
498                 }
499         }
500 }
501
502 #if 0
// Debug helper (normally compiled out): prints the 32 bytes of an AVX2
// register in hex, lowest byte first, in four groups of eight, prefixed
// by a left-aligned label.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");  // Extra gap between groups.
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
543 #endif
544
545 #ifndef HAS_MULTIVERSIONING
546
// Stand-in for the fast path, used when HAS_MULTIVERSIONING is not defined.
// Copies nothing and returns "start" unchanged; none of the other arguments
// are looked at.
547 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
548 {
549         // No fast path possible unless we have multiversioning.
550         return start;
551 }
552
553 #else  // defined(HAS_MULTIVERSIONING)
554
// Forward declarations of the SIMD inner loop, one version per instruction
// set (function multiversioning); the definitions follow further down.
// Both take an input pointer that add_to_frame_fastpath() has already
// aligned, and return a pointer to the first input byte they did not consume.
555 __attribute__((target("sse4.1")))
556 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
557
558 __attribute__((target("avx2")))
559 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
560
561 // Does a memcpy and memchr in one to reduce processing time.
562 // Note that the benefit is somewhat limited if your L3 cache is small,
563 // as you'll (unfortunately) spend most of the time loading the data
564 // from main memory.
565 //
566 // Complicated cases are left to the slow path; it basically stops copying
567 // up until the first instance of "sync_char" (usually a bit before, actually).
568 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
569 // data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Returns a pointer to the first byte of [start, limit) that was NOT added
// to the frame. This is the target("default") version, which adds nothing
// and returns "start" unchanged.
570 __attribute__((target("default")))
571 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
572 {
573         // No fast path possible unless we have SSE 4.1 or higher.
574         return start;
575 }
576
// SIMD-capable version of add_to_frame_fastpath(): adds as much of
// [start, limit) to the frame as it can without having to deal with
// sync_char, and returns a pointer to the first byte it did not add.
// Whatever is left over is for the caller to handle.
//
// Returns "start" unchanged (nothing added) if the frame has no buffer, if
// its length already exceeds its size, or if fewer than 128 bytes are
// available.
//
// NOTE(review): both "sse4.1" and "avx2" are given in a single target
// attribute here, whereas the core function has one version for each;
// confirm that this produces the intended set of versions.
577 __attribute__((target("sse4.1", "avx2")))
578 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
579 {
580         if (current_frame->data == nullptr ||
581             current_frame->len > current_frame->size ||
582             start == limit) {
583                 return start;
584         }
585         size_t orig_bytes = limit - start;
586         if (orig_bytes < 128) {
587                 // Don't bother.
588                 return start;
589         }
590
591         // Don't read more bytes than we can write.
592         limit = min(limit, start + (current_frame->size - current_frame->len));
593
594         // Align end to 32 bytes.
         // (Rounds down, so up to 31 trailing bytes are left unprocessed.)
595         limit = (const uint8_t *)(intptr_t(limit) & ~31);
596
597         if (start >= limit) {
598                 return start;
599         }
600
601         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
         // These go through the regular add_to_frame(); if sync_char shows up
         // among them, we add only what comes before it and stop right there.
602         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
603         if (aligned_start != start) {
604                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
605                 if (sync_start == nullptr) {
606                         add_to_frame(current_frame, "", start, aligned_start);
607                 } else {
608                         add_to_frame(current_frame, "", start, sync_start);
609                         return sync_start;
610                 }
611         }
612
613         // Make the length a multiple of 64.
         // (Both ends are multiples of 32 at this point, so dropping one
         // 32-byte block from the end is enough.)
614         if (current_frame->interleaved) {
615                 if (((limit - aligned_start) % 64) != 0) {
616                         limit -= 32;
617                 }
618                 assert(((limit - aligned_start) % 64) == 0);
619         }
620
621         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
622 }
623
// AVX2 inner loop of the fast path: copies [aligned_start, limit) into the
// frame one chunk at a time, stopping in front of the first chunk that
// contains sync_char (or at limit).
//
// aligned_start must be 32-byte aligned, since the input is read with aligned
// loads. For interleaved frames, a chunk is 64 bytes and is split so that
// bytes at even offsets go to one plane and bytes at odd offsets to the
// other; otherwise, a chunk is 32 bytes and is copied as-is.
//
// Each chunk is stored to the output before it is checked, so the chunk that
// contains sync_char is written too -- but it is not counted in
// current_frame->len and not consumed from the input.
//
// Returns a pointer to the first input byte that was not consumed.
624 __attribute__((target("avx2")))
625 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
626 {
         // sync_char replicated into all 32 byte positions, for comparison.
627         const __m256i needle = _mm256_set1_epi8(sync_char);
628
629         const __restrict __m256i *in = (const __m256i *)aligned_start;
630         if (current_frame->interleaved) {
                 // out1 receives the even-offset bytes of this input, out2 the
                 // odd-offset ones. If an odd number of bytes has been stored so
                 // far, the two planes trade roles (hence the swap).
631                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
632                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
633                 if (current_frame->len % 2 == 1) {
634                         swap(out1, out2);
635                 }
636
                 // Within each 128-bit lane: gather the eight even-offset bytes
                 // into the lower half and the eight odd-offset bytes into the
                 // upper half.
637                 __m256i shuffle_cw = _mm256_set_epi8(
638                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
639                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
640                 while (in < (const __m256i *)limit) {
641                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
642                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
643                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
644
                         // Non-zero wherever either input vector has a byte equal to sync_char.
645                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
646                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
647                         __m256i found = _mm256_or_si256(found1, found2);
648
649                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
650                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
651                 
652                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
653                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
654
                         // lo = lower halves of data1 and data2 (the even-offset bytes),
                         // hi = upper halves of data1 and data2 (the odd-offset bytes).
655                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
656                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
657
658                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
659                         _mm256_storeu_si256(out2, hi);
660
                         // Stop without advancing, so that this chunk is not counted.
661                         if (!_mm256_testz_si256(found, found)) {
662                                 break;
663                         }
664
665                         in += 2;
666                         ++out1;
667                         ++out2;
668                 }
                 // Account for the input bytes consumed (across both planes).
669                 current_frame->len += (uint8_t *)in - aligned_start;
670         } else {
671                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
672                 while (in < (const __m256i *)limit) {
673                         __m256i data = _mm256_load_si256(in);
674                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
675                         __m256i found = _mm256_cmpeq_epi8(data, needle);
                         // Stop without advancing, so that this chunk is not counted.
676                         if (!_mm256_testz_si256(found, found)) {
677                                 break;
678                         }
679
680                         ++in;
681                         ++out;
682                 }
                 // The new length is simply how far the output pointer has advanced.
683                 current_frame->len = (uint8_t *)out - current_frame->data;
684         }
685
686         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
687         return (const uint8_t *)in;
688 }
689
690 __attribute__((target("sse4.1")))
691 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
692 {
693         const __m128i needle = _mm_set1_epi8(sync_char);
694
695         const __m128i *in = (const __m128i *)aligned_start;
696         if (current_frame->interleaved) {
697                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
698                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
699                 if (current_frame->len % 2 == 1) {
700                         swap(out1, out2);
701                 }
702
703                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
704                 while (in < (const __m128i *)limit) {
705                         __m128i data1 = _mm_load_si128(in);
706                         __m128i data2 = _mm_load_si128(in + 1);
707                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
708                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
709                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
710                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
711                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
712                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
713                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
714                         _mm_storeu_si128(out2, hi);
715                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
716                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
717                         if (!_mm_testz_si128(found1, found1) ||
718                             !_mm_testz_si128(found2, found2)) {
719                                 break;
720                         }
721
722                         in += 2;
723                         ++out1;
724                         ++out2;
725                 }
726                 current_frame->len += (uint8_t *)in - aligned_start;
727         } else {
728                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
729                 while (in < (const __m128i *)limit) {
730                         __m128i data = _mm_load_si128(in);
731                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
732                         __m128i found = _mm_cmpeq_epi8(data, needle);
733                         if (!_mm_testz_si128(found, found)) {
734                                 break;
735                         }
736
737                         ++in;
738                         ++out;
739                 }
740                 current_frame->len = (uint8_t *)out - current_frame->data;
741         }
742
743         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
744         return (const uint8_t *)in;
745 }
746
747 #endif  // defined(HAS_MULTIVERSIONING)
748
749 void decode_packs(const libusb_transfer *xfr,
750                   const char *sync_pattern,
751                   int sync_length,
752                   FrameAllocator::Frame *current_frame,
753                   const char *frame_type_name,
754                   function<void(const uint8_t *start)> start_callback)
755 {
756         int offset = 0;
757         for (int i = 0; i < xfr->num_iso_packets; i++) {
758                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
759
760                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
761                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
762                         continue;
763 //exit(5);
764                 }
765
766                 const uint8_t *start = xfr->buffer + offset;
767                 const uint8_t *limit = start + pack->actual_length;
768                 while (start < limit) {  // Usually runs only one iteration.
769                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
770                         if (start == limit) break;
771                         assert(start < limit);
772
773                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
774                         if (start_next_frame == nullptr) {
775                                 // add the rest of the buffer
776                                 add_to_frame(current_frame, frame_type_name, start, limit);
777                                 break;
778                         } else {
779                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
780                                 start = start_next_frame + sync_length;  // skip sync
781                                 start_callback(start);
782                         }
783                 }
784 #if 0
785                 dump_pack(xfr, offset, pack);
786 #endif
787                 offset += pack->length;
788         }
789 }
790
791 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
792 {
793         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
794             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
795                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
796                 libusb_free_transfer(xfr);
797                 exit(3);
798         }
799
800         assert(xfr->user_data != nullptr);
801         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
802
803         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
804                 if (!usb->disconnected) {
805                         fprintf(stderr, "Device went away, stopping transfers.\n");
806                         usb->disconnected = true;
807                         if (usb->card_disconnected_callback) {
808                                 usb->card_disconnected_callback();
809                         }
810                 }
811                 // Don't reschedule the transfer; the loop will stop by itself.
812                 return;
813         }
814
815         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
816                 if (xfr->endpoint == 0x84) {
817                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
818                 } else {
819                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
820
821                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
822                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
823                 }
824         }
825         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
826                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
827                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
828 #if 0
829                 if (setup->wIndex == 44) {
830                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
831                 } else {
832                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
833                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
834                 }
835 #else
836                 memcpy(usb->register_file + usb->current_register, buf, 4);
837                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
838                 if (usb->current_register == 0) {
839                         // read through all of them
840                         printf("register dump:");
841                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
842                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
843                         }
844                         printf("\n");
845                 }
846                 libusb_fill_control_setup(xfr->buffer,
847                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
848                         /*index=*/usb->current_register, /*length=*/4);
849 #endif
850         }
851
852 #if 0
853         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
854         for (i = 0; i < xfr->actual_length; i++) {
855                 printf("%02x", xfr->buffer[i]);
856                 if (i % 16)
857                         printf("\n");
858                 else if (i % 8)
859                         printf("  ");
860                 else
861                         printf(" ");
862         }
863 #endif
864
865         int rc = libusb_submit_transfer(xfr);
866         if (rc < 0) {
867                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
868                 exit(1);
869         }
870 }
871
872 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
873 {
874         if (card_connected_callback != nullptr) {
875                 libusb_device_descriptor desc;
876                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
877                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
878                         libusb_unref_device(dev);
879                         return 1;
880                 }
881
882                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
883                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
884                         card_connected_callback(dev);  // Callback takes ownership.
885                         return 0;
886                 }
887         }
888         libusb_unref_device(dev);
889         return 0;
890 }
891
892 void BMUSBCapture::usb_thread_func()
893 {
894         sched_param param;
895         memset(&param, 0, sizeof(param));
896         param.sched_priority = 1;
897         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
898                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
899         }
900         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
901         while (!should_quit) {
902                 timeval sec { 1, 0 };
903                 int rc = libusb_handle_events_timeout(nullptr, &sec);
904                 if (rc != LIBUSB_SUCCESS)
905                         break;
906         }
907 }
908
909 namespace {
910
911 struct USBCardDevice {
912         uint16_t product;
913         uint8_t bus, port;
914         libusb_device *device;
915 };
916
// Returns a human-readable name for a supported USB product ID.
//
// Only the two product IDs handled below are expected; anything else trips
// the assert. In builds where asserts are compiled out, a placeholder name is
// returned instead of nullptr, so that the result can always be safely
// formatted with %s.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return "Unknown Blackmagic device";
        }
}
928
929 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
930 {
931         const char *product_name = get_product_name(product);
932
933         char buf[256];
934         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
935                 id, bus, port, product_name);
936         return buf;
937 }
938
939 vector<USBCardDevice> find_all_cards()
940 {
941         libusb_device **devices;
942         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
943         if (num_devices == -1) {
944                 fprintf(stderr, "Error finding USB devices\n");
945                 exit(1);
946         }
947         vector<USBCardDevice> found_cards;
948         for (ssize_t i = 0; i < num_devices; ++i) {
949                 libusb_device_descriptor desc;
950                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
951                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
952                         exit(1);
953                 }
954
955                 uint8_t bus = libusb_get_bus_number(devices[i]);
956                 uint8_t port = libusb_get_port_number(devices[i]);
957
958                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
959                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
960                         libusb_unref_device(devices[i]);
961                         continue;
962                 }
963
964                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
965         }
966         libusb_free_device_list(devices, 0);
967
968         // Sort the devices to get a consistent ordering.
969         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
970                 if (a.product != b.product)
971                         return a.product < b.product;
972                 if (a.bus != b.bus)
973                         return a.bus < b.bus;
974                 return a.port < b.port;
975         });
976
977         return found_cards;
978 }
979
980 libusb_device_handle *open_card(int card_index, string *description)
981 {
982         vector<USBCardDevice> found_cards = find_all_cards();
983
984         for (size_t i = 0; i < found_cards.size(); ++i) {
985                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
986                 fprintf(stderr, "%s\n", tmp_description.c_str());
987                 if (i == size_t(card_index)) {
988                         *description = tmp_description;
989                 }
990         }
991
992         if (size_t(card_index) >= found_cards.size()) {
993                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
994                 exit(1);
995         }
996
997         libusb_device_handle *devh;
998         int rc = libusb_open(found_cards[card_index].device, &devh);
999         if (rc < 0) {
1000                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
1001                 exit(1);
1002         }
1003
1004         for (size_t i = 0; i < found_cards.size(); ++i) {
1005                 libusb_unref_device(found_cards[i].device);
1006         }
1007
1008         return devh;
1009 }
1010
1011 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1012 {
1013         uint8_t bus = libusb_get_bus_number(dev);
1014         uint8_t port = libusb_get_port_number(dev);
1015
1016         libusb_device_descriptor desc;
1017         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1018                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1019                 exit(1);
1020         }
1021
1022         *description = get_card_description(card_index, bus, port, desc.idProduct);
1023
1024         libusb_device_handle *devh;
1025         int rc = libusb_open(dev, &devh);
1026         if (rc < 0) {
1027                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1028                 exit(1);
1029         }
1030
1031         return devh;
1032 }
1033
1034 }  // namespace
1035
1036 unsigned BMUSBCapture::num_cards()
1037 {
1038         int rc = libusb_init(nullptr);
1039         if (rc < 0) {
1040                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1041                 exit(1);
1042         }
1043
1044         vector<USBCardDevice> found_cards = find_all_cards();
1045         unsigned ret = found_cards.size();
1046         for (size_t i = 0; i < found_cards.size(); ++i) {
1047                 libusb_unref_device(found_cards[i].device);
1048         }
1049         return ret;
1050 }
1051
1052 void BMUSBCapture::configure_card()
1053 {
1054         if (video_frame_allocator == nullptr) {
1055                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1056                 set_video_frame_allocator(owned_video_frame_allocator.get());
1057         }
1058         if (audio_frame_allocator == nullptr) {
1059                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1060                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1061         }
1062         dequeue_thread_should_quit = false;
1063         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1064
1065         int rc;
1066         struct libusb_transfer *xfr;
1067
1068         rc = libusb_init(nullptr);
1069         if (rc < 0) {
1070                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1071                 exit(1);
1072         }
1073
1074         if (dev == nullptr) {
1075                 devh = open_card(card_index, &description);
1076         } else {
1077                 devh = open_card(card_index, dev, &description);
1078                 libusb_unref_device(dev);
1079         }
1080         if (!devh) {
1081                 fprintf(stderr, "Error finding USB device\n");
1082                 exit(1);
1083         }
1084
1085         libusb_config_descriptor *config;
1086         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1087         if (rc < 0) {
1088                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1089                 exit(1);
1090         }
1091
1092 #if 0
1093         printf("%d interface\n", config->bNumInterfaces);
1094         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1095                 printf("  interface %d\n", interface_number);
1096                 const libusb_interface *interface = &config->interface[interface_number];
1097                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1098                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1099                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1100                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1101                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1102                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1103                         }
1104                 }
1105         }
1106 #endif
1107
1108         rc = libusb_set_configuration(devh, /*configuration=*/1);
1109         if (rc < 0) {
1110                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1111                 exit(1);
1112         }
1113
1114         rc = libusb_claim_interface(devh, 0);
1115         if (rc < 0) {
1116                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1117                 exit(1);
1118         }
1119
1120         // Alternate setting 1 is output, alternate setting 2 is input.
1121         // Card is reset when switching alternates, so the driver uses
1122         // this “double switch” when it wants to reset.
1123         //
1124         // There's also alternate settings 3 and 4, which seem to be
1125         // like 1 and 2 except they advertise less bandwidth needed.
1126         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1127         if (rc < 0) {
1128                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1129                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1130                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1131                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1132                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1133                 }
1134                 exit(1);
1135         }
1136         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1137         if (rc < 0) {
1138                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1139                 exit(1);
1140         }
1141 #if 0
1142         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1143         if (rc < 0) {
1144                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1145                 exit(1);
1146         }
1147 #endif
1148
1149 #if 0
1150         rc = libusb_claim_interface(devh, 3);
1151         if (rc < 0) {
1152                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1153                 exit(1);
1154         }
1155 #endif
1156
1157         // theories:
1158         //   44 is some kind of timer register (first 16 bits count upwards)
1159         //   24 is some sort of watchdog?
1160         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1161         //      (or will go to 0x73c60010?), also seen 0x73c60100
1162         //   12 also changes all the time, unclear why  
1163         //   16 seems to be autodetected mode somehow
1164         //      --    this is e00115e0 after reset?
1165         //                    ed0115e0 after mode change [to output?]
1166         //                    2d0015e0 after more mode change [to input]
1167         //                    ed0115e0 after more mode change
1168         //                    2d0015e0 after more mode change
1169         //
1170         //                    390115e0 seems to indicate we have signal
1171         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1172         //
1173         //                    200015e0 on startup
1174         //         changes to 250115e0 when we sync to the signal
1175         //
1176         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1177         //
1178         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1179         //
1180         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1181         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1182         //
1183         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1184         //    perhaps some of them are related to analog output?
1185         //
1186         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1187         //    but the driver sets it to 0x8036802a at some point.
1188         //
1189         //    all of this is on request 214/215. other requests (192, 219,
1190         //    222, 223, 224) are used for firmware upgrade. Probably best to
1191         //    stay out of it unless you know what you're doing.
1192         //
1193         //
1194         // register 16:
1195         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1196         //
1197         // theories:
1198         //   0x01 - stable signal
1199         //   0x04 - deep color
1200         //   0x08 - unknown (audio??)
1201         //   0x20 - 720p??
1202         //   0x30 - 576p??
1203
1204         update_capture_mode();
1205
1206         struct ctrl {
1207                 int endpoint;
1208                 int request;
1209                 int index;
1210                 uint32_t data;
1211         };
1212         static const ctrl ctrls[] = {
1213                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1214                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1215
1216                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1217                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1218                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1219                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1220         };
1221
1222         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1223                 uint32_t flipped = htonl(ctrls[req].data);
1224                 static uint8_t value[4];
1225                 memcpy(value, &flipped, sizeof(flipped));
1226                 int size = sizeof(value);
1227                 //if (ctrls[req].request == 215) size = 0;
1228                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1229                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1230                 if (rc < 0) {
1231                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1232                         exit(1);
1233                 }
1234
1235                 if (ctrls[req].index == 16 && rc == 4) {
1236                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1237                 }
1238
1239 #if 0
1240                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1241                 for (int i = 0; i < rc; ++i) {
1242                         printf("%02x", value[i]);
1243                 }
1244                 printf("\n");
1245 #endif
1246         }
1247
1248 #if 0
1249         // DEBUG
1250         for ( ;; ) {
1251                 static int my_index = 0;
1252                 static uint8_t value[4];
1253                 int size = sizeof(value);
1254                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1255                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1256                 if (rc < 0) {
1257                         fprintf(stderr, "Error on control\n");
1258                         exit(1);
1259                 }
1260                 printf("rc=%d index=%d: 0x", rc, my_index);
1261                 for (int i = 0; i < rc; ++i) {
1262                         printf("%02x", value[i]);
1263                 }
1264                 printf("\n");
1265         }
1266 #endif
1267
1268 #if 0
1269         // set up an asynchronous transfer of the timer register
1270         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1271         static int completed = 0;
1272
1273         xfr = libusb_alloc_transfer(0);
1274         libusb_fill_control_setup(cmdbuf,
1275             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1276                 /*index=*/44, /*length=*/4);
1277         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1278         xfr->user_data = this;
1279         libusb_submit_transfer(xfr);
1280
1281         // set up an asynchronous transfer of register 24
1282         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1283         static int completed2 = 0;
1284
1285         xfr = libusb_alloc_transfer(0);
1286         libusb_fill_control_setup(cmdbuf2,
1287             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1288                 /*index=*/24, /*length=*/4);
1289         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1290         xfr->user_data = this;
1291         libusb_submit_transfer(xfr);
1292 #endif
1293
1294         // set up an asynchronous transfer of the register dump
1295         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1296         static int completed3 = 0;
1297
1298         xfr = libusb_alloc_transfer(0);
1299         libusb_fill_control_setup(cmdbuf3,
1300             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1301                 /*index=*/current_register, /*length=*/4);
1302         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1303         xfr->user_data = this;
1304         //libusb_submit_transfer(xfr);
1305
1306         //audiofp = fopen("audio.raw", "wb");
1307
1308         // set up isochronous transfers for audio and video
1309         for (int e = 3; e <= 4; ++e) {
1310                 //int num_transfers = (e == 3) ? 6 : 6;
1311                 int num_transfers = 6;
1312                 for (int i = 0; i < num_transfers; ++i) {
1313                         size_t buf_size;
1314                         int num_iso_pack, size;
1315                         if (e == 3) {
1316                                 // Allocate for minimum width (because that will give us the most
1317                                 // number of packets, so we don't need to reallocated, but we'll
1318                                 // default to 720p for the first frame.
1319                                 size = find_xfer_size_for_width(MIN_WIDTH);
1320                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1321                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1322                         } else {
1323                                 size = 0xc0;
1324                                 num_iso_pack = 80;
1325                                 buf_size = num_iso_pack * size;
1326                         }
1327                         int num_bytes = num_iso_pack * size;
1328                         assert(size_t(num_bytes) <= buf_size);
1329 #if LIBUSB_API_VERSION >= 0x01000105
1330                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1331 #else
1332                         uint8_t *buf = nullptr;
1333 #endif
1334                         if (buf == nullptr) {
1335                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1336 #if LIBUSB_API_VERSION >= 0x01000105
1337                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1338 #else
1339                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1340 #endif
1341                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1342                                 buf = new uint8_t[num_bytes];
1343                         }
1344
1345                         xfr = libusb_alloc_transfer(num_iso_pack);
1346                         if (!xfr) {
1347                                 fprintf(stderr, "oom\n");
1348                                 exit(1);
1349                         }
1350
1351                         int ep = LIBUSB_ENDPOINT_IN | e;
1352                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1353                                 num_iso_pack, cb_xfr, nullptr, 0);
1354                         libusb_set_iso_packet_lengths(xfr, size);
1355                         xfr->user_data = this;
1356
1357                         if (e == 3) {
1358                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1359                         }
1360
1361                         iso_xfrs.push_back(xfr);
1362                 }
1363         }
1364 }
1365
1366 void BMUSBCapture::start_bm_capture()
1367 {
1368         int i = 0;
1369         for (libusb_transfer *xfr : iso_xfrs) {
1370                 int rc = libusb_submit_transfer(xfr);
1371                 ++i;
1372                 if (rc < 0) {
1373                         //printf("num_bytes=%d\n", num_bytes);
1374                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1375                                 xfr->endpoint, i, libusb_error_name(rc));
1376                         exit(1);
1377                 }
1378         }
1379
1380
1381 #if 0
1382         libusb_release_interface(devh, 0);
1383 out:
1384         if (devh)
1385                 libusb_close(devh);
1386         libusb_exit(nullptr);
1387         return rc;
1388 #endif
1389 }
1390
1391 void BMUSBCapture::stop_dequeue_thread()
1392 {
1393         dequeue_thread_should_quit = true;
1394         queues_not_empty.notify_all();
1395         dequeue_thread.join();
1396 }
1397
1398 void BMUSBCapture::start_bm_thread()
1399 {
1400         // Devices leaving are discovered by seeing the isochronous packets
1401         // coming back with errors, so only care about devices joining.
1402         if (card_connected_callback != nullptr) {
1403                 if (libusb_hotplug_register_callback(
1404                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1405                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1406                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1407                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1408                         exit(1);
1409                 }
1410         }
1411
1412         should_quit = false;
1413         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1414 }
1415
1416 void BMUSBCapture::stop_bm_thread()
1417 {
1418         should_quit = true;
1419         usb_thread.join();
1420 }
1421
1422 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1423 {
1424         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1425         VideoMode auto_mode;
1426         auto_mode.name = "Autodetect";
1427         auto_mode.autodetect = true;
1428         return {{ 0, auto_mode }};
1429 }
1430
1431 uint32_t BMUSBCapture::get_current_video_mode() const
1432 {
1433         return 0;  // Matches get_available_video_modes().
1434 }
1435
1436 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1437 {
1438         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1439 }
1440
1441 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1442 {
1443         return {
1444                 { 0x00000000, "HDMI/SDI" },
1445                 { 0x02000000, "Component" },
1446                 { 0x04000000, "Composite" },
1447                 { 0x06000000, "S-video" }
1448         };
1449 }
1450
1451 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1452 {
1453         assert((video_input_id & ~0x06000000) == 0);
1454         current_video_input = video_input_id;
1455         update_capture_mode();
1456 }
1457
1458 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1459 {
1460         return {
1461                 { 0x00000000, "Embedded" },
1462                 { 0x10000000, "Analog" }
1463         };
1464 }
1465
1466 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1467 {
1468         assert((audio_input_id & ~0x10000000) == 0);
1469         current_audio_input = audio_input_id;
1470         update_capture_mode();
1471 }
1472
1473 void BMUSBCapture::update_capture_mode()
1474 {
1475         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1476         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1477         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1478
1479         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1480                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1481         if (rc < 0) {
1482                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1483                 exit(1);
1484         }
1485 }
1486
1487 }  // namespace bmusb