]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Release v0.5.2.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.5.2
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb/bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <condition_variable>
29 #include <cstddef>
30 #include <cstdint>
31 #include <deque>
32 #include <functional>
33 #include <memory>
34 #include <mutex>
35 #include <stack>
36 #include <string>
37 #include <thread>
38
39 using namespace std;
40 using namespace std::placeholders;
41
42 #define USB_VENDOR_BLACKMAGIC 0x1edb
43 #define MIN_WIDTH 640
44 #define HEADER_SIZE 44
45 //#define HEADER_SIZE 0
46 #define AUDIO_HEADER_SIZE 4
47
48 #define FRAME_SIZE (8 << 20)  // 8 MB.
49 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
50
51 namespace bmusb {
52
53 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
54 bool BMUSBCapture::hotplug_existing_devices = false;
55
56 namespace {
57
58 FILE *audiofp;
59
60 thread usb_thread;
61 atomic<bool> should_quit;
62
// Returns the isochronous packet size (in bytes) to use for a frame of
// the given width. Video seems to want packets that scale with the width;
// about six lines' worth, at two bytes per pixel, appears to be right.
// The result is rounded up to the 1 kB multiple the device requires.
int find_xfer_size_for_width(int width)
{
	const int six_lines = width * 2 * 6;
	// Note that for 10-bit input, this would have to grow accordingly
	// (i.e., be scaled by 4/3 before the rounding below).
	return (six_lines + 1023) & ~1023;
}
77
78 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
79 {
80         assert(width >= MIN_WIDTH);
81         size_t size = find_xfer_size_for_width(width);
82         int num_iso_pack = xfr->length / size;
83         if (num_iso_pack != xfr->num_iso_packets ||
84             size != xfr->iso_packet_desc[0].length) {
85                 xfr->num_iso_packets = num_iso_pack;
86                 libusb_set_iso_packet_lengths(xfr, size);
87         }
88 }
89
// One row in the lookup table used by decode_video_format().
// Member order matters, since the table is written with positional
// aggregate initialization.
struct VideoFormatEntry {
	// Format word to compare against (after decode_video_format() has
	// masked off the bits it ignores).
	uint16_t normalized_video_format;

	unsigned width;
	unsigned height;
	unsigned second_field_start;

	unsigned extra_lines_top;
	unsigned extra_lines_bottom;

	// Frame rate as a fraction, frame_rate_nom / frame_rate_den.
	unsigned frame_rate_nom;
	unsigned frame_rate_den;

	bool interlaced;
};
97
98 // Get details for the given video format; returns false if detection was incomplete.
99 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
100 {
101         decoded_video_format->id = video_format;
102         decoded_video_format->interlaced = false;
103
104         // TODO: Add these for all formats as we find them.
105         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
106
107         if (video_format == 0x0800) {
108                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
109                 // It's a strange thing, but what can you do.
110                 decoded_video_format->width = 720;
111                 decoded_video_format->height = 525;
112                 decoded_video_format->extra_lines_top = 0;
113                 decoded_video_format->extra_lines_bottom = 0;
114                 decoded_video_format->frame_rate_nom = 3013;
115                 decoded_video_format->frame_rate_den = 100;
116                 decoded_video_format->has_signal = false;
117                 return true;
118         }
119         if ((video_format & 0xe800) != 0xe800) {
120                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
121                         video_format);
122                 decoded_video_format->width = 0;
123                 decoded_video_format->height = 0;
124                 decoded_video_format->extra_lines_top = 0;
125                 decoded_video_format->extra_lines_bottom = 0;
126                 decoded_video_format->frame_rate_nom = 60;
127                 decoded_video_format->frame_rate_den = 1;
128                 decoded_video_format->has_signal = false;
129                 return false;
130         }
131
132         decoded_video_format->has_signal = true;
133
134         // NTSC (480i59.94, I suppose). A special case, see below.
135         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
136                 decoded_video_format->width = 720;
137                 decoded_video_format->height = 480;
138                 decoded_video_format->extra_lines_top = 17;
139                 decoded_video_format->extra_lines_bottom = 28;
140                 decoded_video_format->frame_rate_nom = 30000;
141                 decoded_video_format->frame_rate_den = 1001;
142                 decoded_video_format->second_field_start = 280;
143                 decoded_video_format->interlaced = true;
144                 return true;
145         }
146
147         // PAL (576i50, I suppose). A special case, see below.
148         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
149                 decoded_video_format->width = 720;
150                 decoded_video_format->height = 576;
151                 decoded_video_format->extra_lines_top = 22;
152                 decoded_video_format->extra_lines_bottom = 27;
153                 decoded_video_format->frame_rate_nom = 25;
154                 decoded_video_format->frame_rate_den = 1;
155                 decoded_video_format->second_field_start = 335;
156                 decoded_video_format->interlaced = true;
157                 return true;
158         }
159
160         // 0x8 seems to be a flag about availability of deep color on the input,
161         // except when it's not (e.g. it's the only difference between NTSC
162         // and PAL). Rather confusing. But we clear it here nevertheless, because
163         // usually it doesn't mean anything.
164         //
165         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
166         uint16_t normalized_video_format = video_format & ~0xe80c;
167         constexpr VideoFormatEntry entries[] = {
168                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
169                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
170                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
171                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
172                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
173                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
174                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
175                 { 0x01c3, 1920, 1080,   0,  0,  0,    30,    1, false },  // 1080p30.
176                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
177                 { 0x01e1, 1920, 1080,   0,  0,  0, 30000, 1001, false },  // 1080p29.97.
178                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
179                 { 0x0063, 1920, 1080,   0,  0,  0,    25,    1, false },  // 1080p25.
180                 { 0x0043, 1920, 1080,   0,  0,  0,    25,    1,  true },  // 1080p50.
181                 { 0x008e, 1920, 1080,   0,  0,  0,    24,    1, false },  // 1080p24.
182                 { 0x00a1, 1920, 1080,   0,  0,  0, 24000, 1001, false },  // 1080p23.98.
183         };
184         for (const VideoFormatEntry &entry : entries) {
185                 if (normalized_video_format == entry.normalized_video_format) {
186                         decoded_video_format->width = entry.width;
187                         decoded_video_format->height = entry.height;
188                         decoded_video_format->second_field_start = entry.second_field_start;
189                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
190                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
191                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
192                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
193                         decoded_video_format->interlaced = entry.interlaced;
194                         return true;
195                 }
196         }
197
198         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
199         decoded_video_format->width = 1280;
200         decoded_video_format->height = 720;
201         decoded_video_format->frame_rate_nom = 60;
202         decoded_video_format->frame_rate_den = 1;
203         return false;
204 }
205
206 }  // namespace
207
208 FrameAllocator::~FrameAllocator() {}
209
210 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
211         : frame_size(frame_size)
212 {
213         for (size_t i = 0; i < num_queued_frames; ++i) {
214                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
215         }
216 }
217
218 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
219 {
220         Frame vf;
221         vf.owner = this;
222
223         unique_lock<mutex> lock(freelist_mutex);  // Meh.
224         if (freelist.empty()) {
225                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
226                         frame_size);
227         } else {
228                 vf.data = freelist.top().release();
229                 vf.size = frame_size;
230                 freelist.pop();  // Meh.
231         }
232         return vf;
233 }
234
235 void MallocFrameAllocator::release_frame(Frame frame)
236 {
237         if (frame.overflow > 0) {
238                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
239         }
240         unique_lock<mutex> lock(freelist_mutex);
241         freelist.push(unique_ptr<uint8_t[]>(frame.data));
242 }
243
// Compares two 16-bit counters that are allowed to wrap around.
// a counts as "less than" b if stepping forward from a reaches b in
// between 1 and 0x7fff steps (modulo 0x10000). Equal values are not less.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
	const uint16_t forward_distance = uint16_t(b - a);
	return forward_distance != 0 && forward_distance < 0x8000;
}
255
256 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
257 {
258         unique_lock<mutex> lock(queue_lock);
259         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
260                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
261                         q->back().timecode, timecode);
262                 frame.owner->release_frame(frame);
263                 return;
264         }
265
266         QueuedFrame qf;
267         qf.format = format;
268         qf.timecode = timecode;
269         qf.frame = frame;
270         q->push_back(move(qf));
271         queues_not_empty.notify_one();  // might be spurious
272 }
273
274 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
275 {
276         FILE *fp = fopen(filename, "wb");
277         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
278                 printf("short write!\n");
279         }
280         fclose(fp);
281 }
282
283 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
284 {
285         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
286 }
287
288 void BMUSBCapture::dequeue_thread_func()
289 {
290         if (has_dequeue_callbacks) {
291                 dequeue_init_callback();
292         }
293         while (!dequeue_thread_should_quit) {
294                 unique_lock<mutex> lock(queue_lock);
295                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
296
297                 if (dequeue_thread_should_quit) break;
298
299                 uint16_t video_timecode = pending_video_frames.front().timecode;
300                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
301                 AudioFormat audio_format;
302                 audio_format.bits_per_sample = 24;
303                 audio_format.num_channels = 8;
304                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
305                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
306                                 video_timecode);
307                         QueuedFrame video_frame = pending_video_frames.front();
308                         pending_video_frames.pop_front();
309                         lock.unlock();
310                         video_frame_allocator->release_frame(video_frame.frame);
311                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
312                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
313                                 audio_timecode);
314                         QueuedFrame audio_frame = pending_audio_frames.front();
315                         pending_audio_frames.pop_front();
316                         lock.unlock();
317                         audio_format.id = audio_frame.format;
318
319                         // Use the video format of the pending frame.
320                         QueuedFrame video_frame = pending_video_frames.front();
321                         VideoFormat video_format;
322                         decode_video_format(video_frame.format, &video_format);
323
324                         frame_callback(audio_timecode,
325                                        FrameAllocator::Frame(), 0, video_format,
326                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
327                 } else {
328                         QueuedFrame video_frame = pending_video_frames.front();
329                         QueuedFrame audio_frame = pending_audio_frames.front();
330                         pending_audio_frames.pop_front();
331                         pending_video_frames.pop_front();
332                         lock.unlock();
333
334 #if 0
335                         char filename[255];
336                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
337                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
338                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
339 #endif
340
341                         VideoFormat video_format;
342                         audio_format.id = audio_frame.format;
343                         if (decode_video_format(video_frame.format, &video_format)) {
344                                 frame_callback(video_timecode,
345                                                video_frame.frame, HEADER_SIZE, video_format,
346                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
347                         } else {
348                                 frame_callback(video_timecode,
349                                                FrameAllocator::Frame(), 0, video_format,
350                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
351                         }
352                 }
353         }
354         if (has_dequeue_callbacks) {
355                 dequeue_cleanup_callback();
356         }
357 }
358
359 void BMUSBCapture::start_new_frame(const uint8_t *start)
360 {
361         uint16_t format = (start[3] << 8) | start[2];
362         uint16_t timecode = (start[1] << 8) | start[0];
363
364         if (current_video_frame.len > 0) {
365                 // If format is 0x0800 (no signal), add a fake (empty) audio
366                 // frame to get it out of the queue.
367                 // TODO: Figure out if there are other formats that come with
368                 // no audio, and treat them the same.
369                 if (format == 0x0800) {
370                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
371                         if (fake_audio_frame.data == nullptr) {
372                                 // Oh well, it's just a no-signal frame anyway.
373                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
374                                 current_video_frame.owner->release_frame(current_video_frame);
375                                 current_video_frame = video_frame_allocator->alloc_frame();
376                                 return;
377                         }
378                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
379                 }
380                 //dump_frame();
381                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
382
383                 // Update the assumed frame width. We might be one frame too late on format changes,
384                 // but it's much better than asking the user to choose manually.
385                 VideoFormat video_format;
386                 if (decode_video_format(format, &video_format)) {
387                         assumed_frame_width = video_format.width;
388                 }
389         }
390         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
391         //      format, timecode,
392         //      //start[7], start[6], start[5], start[4],
393         //      read_current_frame, FRAME_SIZE);
394
395         current_video_frame = video_frame_allocator->alloc_frame();
396         //if (current_video_frame.data == nullptr) {
397         //      read_current_frame = -1;
398         //} else {
399         //      read_current_frame = 0;
400         //}
401 }
402
403 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
404 {
405         uint16_t format = (start[3] << 8) | start[2];
406         uint16_t timecode = (start[1] << 8) | start[0];
407         if (current_audio_frame.len > 0) {
408                 //dump_audio_block();
409                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
410         }
411         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
412         //      format, timecode, read_current_audio_block);
413         current_audio_frame = audio_frame_allocator->alloc_frame();
414 }
415
#if 0
// Debugging aid (compiled out): hex-dumps the received bytes of one
// isochronous packet, which start at the given offset into the transfer
// buffer. Sixteen bytes per row, with a wider gap after the eighth.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
	//	printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
	// (Clamp to min(pack->actual_length, 16u) to only see the first row.)
	const unsigned num_bytes = pack->actual_length;
	for (unsigned j = 0; j < num_bytes; j++) {
		printf("%02x", xfr->buffer[j + offset]);
		switch (j % 16) {
		case 15:
			printf("\n");
			break;
		case 7:
			printf("  ");
			break;
		default:
			printf(" ");
			break;
		}
	}
}
#endif
432
// De-interleaves n bytes from src: bytes at even offsets go to dest1,
// bytes at odd offsets go to dest2, each packed contiguously.
// n must be even (asserted).
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
	assert(n % 2 == 0);

	const size_t num_pairs = (n + 1) / 2;
	for (size_t pair = 0; pair < num_pairs; ++pair) {
		dest1[pair] = src[2 * pair];
		dest2[pair] = src[2 * pair + 1];
	}
}
444
445 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
446 {
447         if (current_frame->data == nullptr ||
448             current_frame->len > current_frame->size ||
449             start == end) {
450                 return;
451         }
452
453         int bytes = end - start;
454         if (current_frame->len + bytes > current_frame->size) {
455                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
456                 current_frame->len = current_frame->size;
457                 if (current_frame->overflow > 1048576) {
458                         printf("%d bytes overflow after last %s frame\n",
459                                 int(current_frame->overflow), frame_type_name);
460                         current_frame->overflow = 0;
461                 }
462                 //dump_frame();
463         } else {
464                 if (current_frame->interleaved) {
465                         uint8_t *data = current_frame->data + current_frame->len / 2;
466                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
467                         if (current_frame->len % 2 == 1) {
468                                 ++data;
469                                 swap(data, data2);
470                         }
471                         if (bytes % 2 == 1) {
472                                 *data++ = *start++;
473                                 swap(data, data2);
474                                 ++current_frame->len;
475                                 --bytes;
476                         }
477                         memcpy_interleaved(data, data2, start, bytes);
478                         current_frame->len += bytes;
479                 } else {
480                         memcpy(current_frame->data + current_frame->len, start, bytes);
481                         current_frame->len += bytes;
482                 }
483         }
484 }
485
#if 0
// Debugging aid (compiled out): prints the 32 bytes of an AVX2 register in
// hex, starting with byte 0, in four groups of eight, prefixed by name.
void avx2_dump(const char *name, __m256i n)
{
	uint8_t bytes[32];
	_mm256_storeu_si256((__m256i *)bytes, n);

	printf("%-10s:", name);
	for (int i = 0; i < 32; ++i) {
		if (i != 0 && i % 8 == 0) {
			printf(" ");  // Extra gap between groups of eight.
		}
		printf(" %02x", bytes[i]);
	}
	printf("\n");
}
#endif
528
529 #ifndef HAS_MULTIVERSIONING
530
531 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
532 {
533         // No fast path possible unless we have multiversioning.
534         return start;
535 }
536
537 #else  // defined(HAS_MULTIVERSIONING)
538
539 __attribute__((target("sse4.1")))
540 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
541
542 __attribute__((target("avx2")))
543 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
544
545 // Does a memcpy and memchr in one to reduce processing time.
546 // Note that the benefit is somewhat limited if your L3 cache is small,
547 // as you'll (unfortunately) spend most of the time loading the data
548 // from main memory.
549 //
550 // Complicated cases are left to the slow path; it basically stops copying
551 // up until the first instance of "sync_char" (usually a bit before, actually).
552 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
553 // data, and what we really need this for is the 00 00 ff ff marker in video data.
554 __attribute__((target("default")))
555 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
556 {
557         // No fast path possible unless we have SSE 4.1 or higher.
558         return start;
559 }
560
561 __attribute__((target("sse4.1", "avx2")))
562 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
563 {
564         if (current_frame->data == nullptr ||
565             current_frame->len > current_frame->size ||
566             start == limit) {
567                 return start;
568         }
569         size_t orig_bytes = limit - start;
570         if (orig_bytes < 128) {
571                 // Don't bother.
572                 return start;
573         }
574
575         // Don't read more bytes than we can write.
576         limit = min(limit, start + (current_frame->size - current_frame->len));
577
578         // Align end to 32 bytes.
579         limit = (const uint8_t *)(intptr_t(limit) & ~31);
580
581         if (start >= limit) {
582                 return start;
583         }
584
585         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
586         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
587         if (aligned_start != start) {
588                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
589                 if (sync_start == nullptr) {
590                         add_to_frame(current_frame, "", start, aligned_start);
591                 } else {
592                         add_to_frame(current_frame, "", start, sync_start);
593                         return sync_start;
594                 }
595         }
596
597         // Make the length a multiple of 64.
598         if (current_frame->interleaved) {
599                 if (((limit - aligned_start) % 64) != 0) {
600                         limit -= 32;
601                 }
602                 assert(((limit - aligned_start) % 64) == 0);
603         }
604
605         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
606 }
607
608 __attribute__((target("avx2")))
609 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
610 {
611         const __m256i needle = _mm256_set1_epi8(sync_char);
612
613         const __restrict __m256i *in = (const __m256i *)aligned_start;
614         if (current_frame->interleaved) {
615                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
616                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
617                 if (current_frame->len % 2 == 1) {
618                         swap(out1, out2);
619                 }
620
621                 __m256i shuffle_cw = _mm256_set_epi8(
622                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
623                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
624                 while (in < (const __m256i *)limit) {
625                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
626                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
627                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
628
629                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
630                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
631                         __m256i found = _mm256_or_si256(found1, found2);
632
633                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
634                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
635                 
636                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
637                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
638
639                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
640                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
641
642                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
643                         _mm256_storeu_si256(out2, hi);
644
645                         if (!_mm256_testz_si256(found, found)) {
646                                 break;
647                         }
648
649                         in += 2;
650                         ++out1;
651                         ++out2;
652                 }
653                 current_frame->len += (uint8_t *)in - aligned_start;
654         } else {
655                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
656                 while (in < (const __m256i *)limit) {
657                         __m256i data = _mm256_load_si256(in);
658                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
659                         __m256i found = _mm256_cmpeq_epi8(data, needle);
660                         if (!_mm256_testz_si256(found, found)) {
661                                 break;
662                         }
663
664                         ++in;
665                         ++out;
666                 }
667                 current_frame->len = (uint8_t *)out - current_frame->data;
668         }
669
670         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
671         return (const uint8_t *)in;
672 }
673
674 __attribute__((target("sse4.1")))
675 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
676 {
677         const __m128i needle = _mm_set1_epi8(sync_char);
678
679         const __m128i *in = (const __m128i *)aligned_start;
680         if (current_frame->interleaved) {
681                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
682                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
683                 if (current_frame->len % 2 == 1) {
684                         swap(out1, out2);
685                 }
686
687                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
688                 while (in < (const __m128i *)limit) {
689                         __m128i data1 = _mm_load_si128(in);
690                         __m128i data2 = _mm_load_si128(in + 1);
691                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
692                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
693                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
694                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
695                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
696                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
697                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
698                         _mm_storeu_si128(out2, hi);
699                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
700                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
701                         if (!_mm_testz_si128(found1, found1) ||
702                             !_mm_testz_si128(found2, found2)) {
703                                 break;
704                         }
705
706                         in += 2;
707                         ++out1;
708                         ++out2;
709                 }
710                 current_frame->len += (uint8_t *)in - aligned_start;
711         } else {
712                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
713                 while (in < (const __m128i *)limit) {
714                         __m128i data = _mm_load_si128(in);
715                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
716                         __m128i found = _mm_cmpeq_epi8(data, needle);
717                         if (!_mm_testz_si128(found, found)) {
718                                 break;
719                         }
720
721                         ++in;
722                         ++out;
723                 }
724                 current_frame->len = (uint8_t *)out - current_frame->data;
725         }
726
727         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
728         return (const uint8_t *)in;
729 }
730
731 #endif  // defined(HAS_MULTIVERSIONING)
732
733 void decode_packs(const libusb_transfer *xfr,
734                   const char *sync_pattern,
735                   int sync_length,
736                   FrameAllocator::Frame *current_frame,
737                   const char *frame_type_name,
738                   function<void(const uint8_t *start)> start_callback)
739 {
740         int offset = 0;
741         for (int i = 0; i < xfr->num_iso_packets; i++) {
742                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
743
744                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
745                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
746                         continue;
747 //exit(5);
748                 }
749
750                 const uint8_t *start = xfr->buffer + offset;
751                 const uint8_t *limit = start + pack->actual_length;
752                 while (start < limit) {  // Usually runs only one iteration.
753                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
754                         if (start == limit) break;
755                         assert(start < limit);
756
757                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
758                         if (start_next_frame == nullptr) {
759                                 // add the rest of the buffer
760                                 add_to_frame(current_frame, frame_type_name, start, limit);
761                                 break;
762                         } else {
763                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
764                                 start = start_next_frame + sync_length;  // skip sync
765                                 start_callback(start);
766                         }
767                 }
768 #if 0
769                 dump_pack(xfr, offset, pack);
770 #endif
771                 offset += pack->length;
772         }
773 }
774
775 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
776 {
777         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
778             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
779                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
780                 libusb_free_transfer(xfr);
781                 exit(3);
782         }
783
784         assert(xfr->user_data != nullptr);
785         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
786
787         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
788                 if (!usb->disconnected) {
789                         fprintf(stderr, "Device went away, stopping transfers.\n");
790                         usb->disconnected = true;
791                         if (usb->card_disconnected_callback) {
792                                 usb->card_disconnected_callback();
793                         }
794                 }
795                 // Don't reschedule the transfer; the loop will stop by itself.
796                 return;
797         }
798
799         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
800                 if (xfr->endpoint == 0x84) {
801                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
802                 } else {
803                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
804
805                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
806                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
807                 }
808         }
809         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
810                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
811                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
812 #if 0
813                 if (setup->wIndex == 44) {
814                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
815                 } else {
816                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
817                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
818                 }
819 #else
820                 memcpy(usb->register_file + usb->current_register, buf, 4);
821                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
822                 if (usb->current_register == 0) {
823                         // read through all of them
824                         printf("register dump:");
825                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
826                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
827                         }
828                         printf("\n");
829                 }
830                 libusb_fill_control_setup(xfr->buffer,
831                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
832                         /*index=*/usb->current_register, /*length=*/4);
833 #endif
834         }
835
836 #if 0
837         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
838         for (i = 0; i < xfr->actual_length; i++) {
839                 printf("%02x", xfr->buffer[i]);
840                 if (i % 16)
841                         printf("\n");
842                 else if (i % 8)
843                         printf("  ");
844                 else
845                         printf(" ");
846         }
847 #endif
848
849         int rc = libusb_submit_transfer(xfr);
850         if (rc < 0) {
851                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
852                 exit(1);
853         }
854 }
855
856 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
857 {
858         if (card_connected_callback != nullptr) {
859                 libusb_device_descriptor desc;
860                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
861                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
862                         libusb_unref_device(dev);
863                         return 1;
864                 }
865
866                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
867                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
868                         card_connected_callback(dev);  // Callback takes ownership.
869                         return 0;
870                 }
871         }
872         libusb_unref_device(dev);
873         return 0;
874 }
875
876 void BMUSBCapture::usb_thread_func()
877 {
878         sched_param param;
879         memset(&param, 0, sizeof(param));
880         param.sched_priority = 1;
881         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
882                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
883         }
884         while (!should_quit) {
885                 timeval sec { 1, 0 };
886                 int rc = libusb_handle_events_timeout(nullptr, &sec);
887                 if (rc != LIBUSB_SUCCESS)
888                         break;
889         }
890 }
891
892 namespace {
893
894 struct USBCardDevice {
895         uint16_t product;
896         uint8_t bus, port;
897         libusb_device *device;
898 };
899
// Maps a supported USB product ID to a human-readable product name.
// Unknown IDs trip an assertion; if assertions are compiled out, nullptr
// is returned instead.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
911
912 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
913 {
914         const char *product_name = get_product_name(product);
915
916         char buf[256];
917         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
918                 id, bus, port, product_name);
919         return buf;
920 }
921
922 vector<USBCardDevice> find_all_cards()
923 {
924         libusb_device **devices;
925         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
926         if (num_devices == -1) {
927                 fprintf(stderr, "Error finding USB devices\n");
928                 exit(1);
929         }
930         vector<USBCardDevice> found_cards;
931         for (ssize_t i = 0; i < num_devices; ++i) {
932                 libusb_device_descriptor desc;
933                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
934                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
935                         exit(1);
936                 }
937
938                 uint8_t bus = libusb_get_bus_number(devices[i]);
939                 uint8_t port = libusb_get_port_number(devices[i]);
940
941                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
942                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
943                         libusb_unref_device(devices[i]);
944                         continue;
945                 }
946
947                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
948         }
949         libusb_free_device_list(devices, 0);
950
951         // Sort the devices to get a consistent ordering.
952         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
953                 if (a.product != b.product)
954                         return a.product < b.product;
955                 if (a.bus != b.bus)
956                         return a.bus < b.bus;
957                 return a.port < b.port;
958         });
959
960         return found_cards;
961 }
962
963 libusb_device_handle *open_card(int card_index, string *description)
964 {
965         vector<USBCardDevice> found_cards = find_all_cards();
966
967         for (size_t i = 0; i < found_cards.size(); ++i) {
968                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
969                 fprintf(stderr, "%s\n", tmp_description.c_str());
970                 if (i == size_t(card_index)) {
971                         *description = tmp_description;
972                 }
973         }
974
975         if (size_t(card_index) >= found_cards.size()) {
976                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
977                 exit(1);
978         }
979
980         libusb_device_handle *devh;
981         int rc = libusb_open(found_cards[card_index].device, &devh);
982         if (rc < 0) {
983                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
984                 exit(1);
985         }
986
987         for (size_t i = 0; i < found_cards.size(); ++i) {
988                 libusb_unref_device(found_cards[i].device);
989         }
990
991         return devh;
992 }
993
994 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
995 {
996         uint8_t bus = libusb_get_bus_number(dev);
997         uint8_t port = libusb_get_port_number(dev);
998
999         libusb_device_descriptor desc;
1000         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1001                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1002                 exit(1);
1003         }
1004
1005         *description = get_card_description(card_index, bus, port, desc.idProduct);
1006
1007         libusb_device_handle *devh;
1008         int rc = libusb_open(dev, &devh);
1009         if (rc < 0) {
1010                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1011                 exit(1);
1012         }
1013
1014         return devh;
1015 }
1016
1017 }  // namespace
1018
1019 unsigned BMUSBCapture::num_cards()
1020 {
1021         int rc = libusb_init(nullptr);
1022         if (rc < 0) {
1023                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1024                 exit(1);
1025         }
1026
1027         vector<USBCardDevice> found_cards = find_all_cards();
1028         unsigned ret = found_cards.size();
1029         for (size_t i = 0; i < found_cards.size(); ++i) {
1030                 libusb_unref_device(found_cards[i].device);
1031         }
1032         return ret;
1033 }
1034
1035 void BMUSBCapture::configure_card()
1036 {
1037         if (video_frame_allocator == nullptr) {
1038                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1039                 set_video_frame_allocator(owned_video_frame_allocator.get());
1040         }
1041         if (audio_frame_allocator == nullptr) {
1042                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1043                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1044         }
1045         dequeue_thread_should_quit = false;
1046         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1047
1048         int rc;
1049         struct libusb_transfer *xfr;
1050
1051         rc = libusb_init(nullptr);
1052         if (rc < 0) {
1053                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1054                 exit(1);
1055         }
1056
1057         if (dev == nullptr) {
1058                 devh = open_card(card_index, &description);
1059         } else {
1060                 devh = open_card(card_index, dev, &description);
1061                 libusb_unref_device(dev);
1062         }
1063         if (!devh) {
1064                 fprintf(stderr, "Error finding USB device\n");
1065                 exit(1);
1066         }
1067
1068         libusb_config_descriptor *config;
1069         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1070         if (rc < 0) {
1071                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1072                 exit(1);
1073         }
1074
1075 #if 0
1076         printf("%d interface\n", config->bNumInterfaces);
1077         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1078                 printf("  interface %d\n", interface_number);
1079                 const libusb_interface *interface = &config->interface[interface_number];
1080                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1081                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1082                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1083                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1084                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1085                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1086                         }
1087                 }
1088         }
1089 #endif
1090
1091         rc = libusb_set_configuration(devh, /*configuration=*/1);
1092         if (rc < 0) {
1093                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1094                 exit(1);
1095         }
1096
1097         rc = libusb_claim_interface(devh, 0);
1098         if (rc < 0) {
1099                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1100                 exit(1);
1101         }
1102
1103         // Alternate setting 1 is output, alternate setting 2 is input.
1104         // Card is reset when switching alternates, so the driver uses
1105         // this “double switch” when it wants to reset.
1106         //
1107         // There's also alternate settings 3 and 4, which seem to be
1108         // like 1 and 2 except they advertise less bandwidth needed.
1109         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1110         if (rc < 0) {
1111                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1112                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1113                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1114                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1115                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1116                 }
1117                 exit(1);
1118         }
1119         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1120         if (rc < 0) {
1121                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1122                 exit(1);
1123         }
1124 #if 0
1125         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1126         if (rc < 0) {
1127                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1128                 exit(1);
1129         }
1130 #endif
1131
1132 #if 0
1133         rc = libusb_claim_interface(devh, 3);
1134         if (rc < 0) {
1135                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1136                 exit(1);
1137         }
1138 #endif
1139
1140         // theories:
1141         //   44 is some kind of timer register (first 16 bits count upwards)
1142         //   24 is some sort of watchdog?
1143         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1144         //      (or will go to 0x73c60010?), also seen 0x73c60100
1145         //   12 also changes all the time, unclear why  
1146         //   16 seems to be autodetected mode somehow
1147         //      --    this is e00115e0 after reset?
1148         //                    ed0115e0 after mode change [to output?]
1149         //                    2d0015e0 after more mode change [to input]
1150         //                    ed0115e0 after more mode change
1151         //                    2d0015e0 after more mode change
1152         //
1153         //                    390115e0 seems to indicate we have signal
1154         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1155         //
1156         //                    200015e0 on startup
1157         //         changes to 250115e0 when we sync to the signal
1158         //
1159         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1160         //
1161         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1162         //
1163         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1164         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1165         //
1166         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1167         //    perhaps some of them are related to analog output?
1168         //
1169         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1170         //    but the driver sets it to 0x8036802a at some point.
1171         //
1172         //    all of this is on request 214/215. other requests (192, 219,
1173         //    222, 223, 224) are used for firmware upgrade. Probably best to
1174         //    stay out of it unless you know what you're doing.
1175         //
1176         //
1177         // register 16:
1178         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1179         //
1180         // theories:
1181         //   0x01 - stable signal
1182         //   0x04 - deep color
1183         //   0x08 - unknown (audio??)
1184         //   0x20 - 720p??
1185         //   0x30 - 576p??
1186
1187         update_capture_mode();
1188
1189         struct ctrl {
1190                 int endpoint;
1191                 int request;
1192                 int index;
1193                 uint32_t data;
1194         };
1195         static const ctrl ctrls[] = {
1196                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1197                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1198
1199                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1200                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1201                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1202                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1203         };
1204
1205         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1206                 uint32_t flipped = htonl(ctrls[req].data);
1207                 static uint8_t value[4];
1208                 memcpy(value, &flipped, sizeof(flipped));
1209                 int size = sizeof(value);
1210                 //if (ctrls[req].request == 215) size = 0;
1211                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1212                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1213                 if (rc < 0) {
1214                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1215                         exit(1);
1216                 }
1217
1218                 if (ctrls[req].index == 16 && rc == 4) {
1219                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1220                 }
1221
1222 #if 0
1223                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1224                 for (int i = 0; i < rc; ++i) {
1225                         printf("%02x", value[i]);
1226                 }
1227                 printf("\n");
1228 #endif
1229         }
1230
1231 #if 0
1232         // DEBUG
1233         for ( ;; ) {
1234                 static int my_index = 0;
1235                 static uint8_t value[4];
1236                 int size = sizeof(value);
1237                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1238                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1239                 if (rc < 0) {
1240                         fprintf(stderr, "Error on control\n");
1241                         exit(1);
1242                 }
1243                 printf("rc=%d index=%d: 0x", rc, my_index);
1244                 for (int i = 0; i < rc; ++i) {
1245                         printf("%02x", value[i]);
1246                 }
1247                 printf("\n");
1248         }
1249 #endif
1250
1251 #if 0
1252         // set up an asynchronous transfer of the timer register
1253         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1254         static int completed = 0;
1255
1256         xfr = libusb_alloc_transfer(0);
1257         libusb_fill_control_setup(cmdbuf,
1258             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1259                 /*index=*/44, /*length=*/4);
1260         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1261         xfr->user_data = this;
1262         libusb_submit_transfer(xfr);
1263
1264         // set up an asynchronous transfer of register 24
1265         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1266         static int completed2 = 0;
1267
1268         xfr = libusb_alloc_transfer(0);
1269         libusb_fill_control_setup(cmdbuf2,
1270             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1271                 /*index=*/24, /*length=*/4);
1272         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1273         xfr->user_data = this;
1274         libusb_submit_transfer(xfr);
1275 #endif
1276
1277         // set up an asynchronous transfer of the register dump
1278         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1279         static int completed3 = 0;
1280
1281         xfr = libusb_alloc_transfer(0);
1282         libusb_fill_control_setup(cmdbuf3,
1283             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1284                 /*index=*/current_register, /*length=*/4);
1285         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1286         xfr->user_data = this;
1287         //libusb_submit_transfer(xfr);
1288
1289         //audiofp = fopen("audio.raw", "wb");
1290
1291         // set up isochronous transfers for audio and video
1292         for (int e = 3; e <= 4; ++e) {
1293                 //int num_transfers = (e == 3) ? 6 : 6;
1294                 int num_transfers = 6;
1295                 for (int i = 0; i < num_transfers; ++i) {
1296                         size_t buf_size;
1297                         int num_iso_pack, size;
1298                         if (e == 3) {
1299                                 // Allocate for minimum width (because that will give us the most
1300                                 // number of packets, so we don't need to reallocated, but we'll
1301                                 // default to 720p for the first frame.
1302                                 size = find_xfer_size_for_width(MIN_WIDTH);
1303                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1304                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1305                         } else {
1306                                 size = 0xc0;
1307                                 num_iso_pack = 80;
1308                                 buf_size = num_iso_pack * size;
1309                         }
1310                         int num_bytes = num_iso_pack * size;
1311                         assert(size_t(num_bytes) <= buf_size);
1312 #if LIBUSB_API_VERSION >= 0x01000105
1313                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1314 #else
1315                         uint8_t *buf = nullptr;
1316 #endif
1317                         if (buf == nullptr) {
1318                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1319 #if LIBUSB_API_VERSION >= 0x01000105
1320                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1321 #else
1322                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1323 #endif
1324                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1325                                 buf = new uint8_t[num_bytes];
1326                         }
1327
1328                         xfr = libusb_alloc_transfer(num_iso_pack);
1329                         if (!xfr) {
1330                                 fprintf(stderr, "oom\n");
1331                                 exit(1);
1332                         }
1333
1334                         int ep = LIBUSB_ENDPOINT_IN | e;
1335                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1336                                 num_iso_pack, cb_xfr, nullptr, 0);
1337                         libusb_set_iso_packet_lengths(xfr, size);
1338                         xfr->user_data = this;
1339
1340                         if (e == 3) {
1341                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1342                         }
1343
1344                         iso_xfrs.push_back(xfr);
1345                 }
1346         }
1347 }
1348
1349 void BMUSBCapture::start_bm_capture()
1350 {
1351         int i = 0;
1352         for (libusb_transfer *xfr : iso_xfrs) {
1353                 int rc = libusb_submit_transfer(xfr);
1354                 ++i;
1355                 if (rc < 0) {
1356                         //printf("num_bytes=%d\n", num_bytes);
1357                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1358                                 xfr->endpoint, i, libusb_error_name(rc));
1359                         exit(1);
1360                 }
1361         }
1362
1363
1364 #if 0
1365         libusb_release_interface(devh, 0);
1366 out:
1367         if (devh)
1368                 libusb_close(devh);
1369         libusb_exit(nullptr);
1370         return rc;
1371 #endif
1372 }
1373
1374 void BMUSBCapture::stop_dequeue_thread()
1375 {
1376         dequeue_thread_should_quit = true;
1377         queues_not_empty.notify_all();
1378         dequeue_thread.join();
1379 }
1380
1381 void BMUSBCapture::start_bm_thread()
1382 {
1383         // Devices leaving are discovered by seeing the isochronous packets
1384         // coming back with errors, so only care about devices joining.
1385         if (card_connected_callback != nullptr) {
1386                 if (libusb_hotplug_register_callback(
1387                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1388                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1389                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1390                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1391                         exit(1);
1392                 }
1393         }
1394
1395         should_quit = false;
1396         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1397 }
1398
1399 void BMUSBCapture::stop_bm_thread()
1400 {
1401         should_quit = true;
1402         usb_thread.join();
1403 }
1404
1405 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1406 {
1407         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1408         VideoMode auto_mode;
1409         auto_mode.name = "Autodetect";
1410         auto_mode.autodetect = true;
1411         return {{ 0, auto_mode }};
1412 }
1413
1414 uint32_t BMUSBCapture::get_current_video_mode() const
1415 {
1416         return 0;  // Matches get_available_video_modes().
1417 }
1418
1419 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1420 {
1421         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1422 }
1423
1424 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1425 {
1426         return {
1427                 { 0x00000000, "HDMI/SDI" },
1428                 { 0x02000000, "Component" },
1429                 { 0x04000000, "Composite" },
1430                 { 0x06000000, "S-video" }
1431         };
1432 }
1433
1434 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1435 {
1436         assert((video_input_id & ~0x06000000) == 0);
1437         current_video_input = video_input_id;
1438         update_capture_mode();
1439 }
1440
1441 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1442 {
1443         return {
1444                 { 0x00000000, "Embedded" },
1445                 { 0x10000000, "Analog" }
1446         };
1447 }
1448
1449 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1450 {
1451         assert((audio_input_id & ~0x10000000) == 0);
1452         current_audio_input = audio_input_id;
1453         update_capture_mode();
1454 }
1455
1456 void BMUSBCapture::update_capture_mode()
1457 {
1458         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1459         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1460         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1461
1462         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1463                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1464         if (rc < 0) {
1465                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1466                 exit(1);
1467         }
1468 }
1469
1470 }  // namespace bmusb