]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Set libusb timeout to one second, to help shutdown if the thread is started with...
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.4
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb/bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <condition_variable>
29 #include <cstddef>
30 #include <cstdint>
31 #include <deque>
32 #include <functional>
33 #include <memory>
34 #include <mutex>
35 #include <stack>
36 #include <string>
37 #include <thread>
38
39 using namespace std;
40 using namespace std::placeholders;
41
42 #define USB_VENDOR_BLACKMAGIC 0x1edb
43 #define MIN_WIDTH 640
44 #define HEADER_SIZE 44
45 //#define HEADER_SIZE 0
46 #define AUDIO_HEADER_SIZE 4
47
48 #define FRAME_SIZE (8 << 20)  // 8 MB.
49 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
50
51 namespace bmusb {
52
53 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
54 bool BMUSBCapture::hotplug_existing_devices = false;
55
56 namespace {
57
58 FILE *audiofp;
59
60 thread usb_thread;
61 atomic<bool> should_quit;
62
// Returns the isochronous packet size (in bytes) to use for video of the
// given width: width * 2 * 6 bytes (about six lines' worth, presumably at
// two bytes per pixel for 8-bit UYVY), rounded up to a multiple of 1 kB.
int find_xfer_size_for_width(int width)
{
        constexpr int lines_per_packet = 6;
        constexpr int packet_granularity = 1024;

        // Note that for 10-bit input, the size would need to be increased
        // accordingly (size * 4 / 3); that is not done here.
        const int raw_size = width * 2 * lines_per_packet;

        const bool already_rounded = (raw_size % packet_granularity == 0);
        return already_rounded
                ? raw_size
                : (raw_size & ~(packet_granularity - 1)) + packet_granularity;
}
77
78 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
79 {
80         assert(width >= MIN_WIDTH);
81         size_t size = find_xfer_size_for_width(width);
82         int num_iso_pack = xfr->length / size;
83         if (num_iso_pack != xfr->num_iso_packets ||
84             size != xfr->iso_packet_desc[0].length) {
85                 xfr->num_iso_packets = num_iso_pack;
86                 libusb_set_iso_packet_lengths(xfr, size);
87         }
88 }
89
// One row of the lookup table in decode_video_format(). The member order
// matters, since the table is written using aggregate initialization.
struct VideoFormatEntry {
        // Format word to match against, after the flag bits have been masked off.
        uint16_t normalized_video_format;

        // Copied verbatim into the corresponding VideoFormat fields on a match.
        unsigned width;
        unsigned height;
        unsigned second_field_start;
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;
        unsigned frame_rate_nom;
        unsigned frame_rate_den;
        bool interlaced;
};
97
98 // Get details for the given video format; returns false if detection was incomplete.
99 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
100 {
101         decoded_video_format->id = video_format;
102         decoded_video_format->interlaced = false;
103
104         // TODO: Add these for all formats as we find them.
105         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
106
107         if (video_format == 0x0800) {
108                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
109                 // It's a strange thing, but what can you do.
110                 decoded_video_format->width = 720;
111                 decoded_video_format->height = 525;
112                 decoded_video_format->extra_lines_top = 0;
113                 decoded_video_format->extra_lines_bottom = 0;
114                 decoded_video_format->frame_rate_nom = 3013;
115                 decoded_video_format->frame_rate_den = 100;
116                 decoded_video_format->has_signal = false;
117                 return true;
118         }
119         if ((video_format & 0xe800) != 0xe800) {
120                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
121                         video_format);
122                 decoded_video_format->width = 0;
123                 decoded_video_format->height = 0;
124                 decoded_video_format->extra_lines_top = 0;
125                 decoded_video_format->extra_lines_bottom = 0;
126                 decoded_video_format->frame_rate_nom = 60;
127                 decoded_video_format->frame_rate_den = 1;
128                 decoded_video_format->has_signal = false;
129                 return false;
130         }
131
132         decoded_video_format->has_signal = true;
133
134         // NTSC (480i59.94, I suppose). A special case, see below.
135         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
136                 decoded_video_format->width = 720;
137                 decoded_video_format->height = 480;
138                 decoded_video_format->extra_lines_top = 17;
139                 decoded_video_format->extra_lines_bottom = 28;
140                 decoded_video_format->frame_rate_nom = 30000;
141                 decoded_video_format->frame_rate_den = 1001;
142                 decoded_video_format->second_field_start = 280;
143                 decoded_video_format->interlaced = true;
144                 return true;
145         }
146
147         // PAL (576i50, I suppose). A special case, see below.
148         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
149                 decoded_video_format->width = 720;
150                 decoded_video_format->height = 576;
151                 decoded_video_format->extra_lines_top = 22;
152                 decoded_video_format->extra_lines_bottom = 27;
153                 decoded_video_format->frame_rate_nom = 25;
154                 decoded_video_format->frame_rate_den = 1;
155                 decoded_video_format->second_field_start = 335;
156                 decoded_video_format->interlaced = true;
157                 return true;
158         }
159
160         // 0x8 seems to be a flag about availability of deep color on the input,
161         // except when it's not (e.g. it's the only difference between NTSC
162         // and PAL). Rather confusing. But we clear it here nevertheless, because
163         // usually it doesn't mean anything.
164         //
165         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
166         uint16_t normalized_video_format = video_format & ~0xe80c;
167         constexpr VideoFormatEntry entries[] = {
168                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
169                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
170                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
171                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
172                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
173                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
174                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
175                 { 0x01c3, 1920, 1080,   0,  0,  0,    30,    1, false },  // 1080p30.
176                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
177                 { 0x01e1, 1920, 1080,   0,  0,  0, 30000, 1001, false },  // 1080p29.97.
178                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
179                 { 0x0063, 1920, 1080,   0,  0,  0,    25,    1, false },  // 1080p25.
180                 { 0x0043, 1920, 1080,   0,  0,  0,    25,    1,  true },  // 1080p50.
181                 { 0x008e, 1920, 1080,   0,  0,  0,    24,    1, false },  // 1080p24.
182                 { 0x00a1, 1920, 1080,   0,  0,  0, 24000, 1001, false },  // 1080p23.98.
183         };
184         for (const VideoFormatEntry &entry : entries) {
185                 if (normalized_video_format == entry.normalized_video_format) {
186                         decoded_video_format->width = entry.width;
187                         decoded_video_format->height = entry.height;
188                         decoded_video_format->second_field_start = entry.second_field_start;
189                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
190                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
191                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
192                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
193                         decoded_video_format->interlaced = entry.interlaced;
194                         return true;
195                 }
196         }
197
198         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
199         decoded_video_format->width = 1280;
200         decoded_video_format->height = 720;
201         decoded_video_format->frame_rate_nom = 60;
202         decoded_video_format->frame_rate_den = 1;
203         return false;
204 }
205
206 }  // namespace
207
208 FrameAllocator::~FrameAllocator() {}
209
210 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
211         : frame_size(frame_size)
212 {
213         for (size_t i = 0; i < num_queued_frames; ++i) {
214                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
215         }
216 }
217
218 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
219 {
220         Frame vf;
221         vf.owner = this;
222
223         unique_lock<mutex> lock(freelist_mutex);  // Meh.
224         if (freelist.empty()) {
225                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
226                         frame_size);
227         } else {
228                 vf.data = freelist.top().release();
229                 vf.size = frame_size;
230                 freelist.pop();  // Meh.
231         }
232         return vf;
233 }
234
235 void MallocFrameAllocator::release_frame(Frame frame)
236 {
237         if (frame.overflow > 0) {
238                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
239         }
240         unique_lock<mutex> lock(freelist_mutex);
241         freelist.push(unique_ptr<uint8_t[]>(frame.data));
242 }
243
// Compares two 16-bit counters that are allowed to wrap around: returns true
// iff stepping forward from "a" reaches "b" in at least one and fewer than
// 0x8000 steps (modulo 0x10000). Equal values are never "less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Modular subtraction gives the forward distance from a to b,
        // no matter which of them is numerically larger.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
255
256 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
257 {
258         unique_lock<mutex> lock(queue_lock);
259         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
260                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
261                         q->back().timecode, timecode);
262                 frame.owner->release_frame(frame);
263                 return;
264         }
265
266         QueuedFrame qf;
267         qf.format = format;
268         qf.timecode = timecode;
269         qf.frame = frame;
270         q->push_back(move(qf));
271         queues_not_empty.notify_one();  // might be spurious
272 }
273
274 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
275 {
276         FILE *fp = fopen(filename, "wb");
277         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
278                 printf("short write!\n");
279         }
280         fclose(fp);
281 }
282
283 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
284 {
285         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
286 }
287
288 void BMUSBCapture::dequeue_thread_func()
289 {
290         if (has_dequeue_callbacks) {
291                 dequeue_init_callback();
292         }
293         while (!dequeue_thread_should_quit) {
294                 unique_lock<mutex> lock(queue_lock);
295                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
296
297                 if (dequeue_thread_should_quit) break;
298
299                 uint16_t video_timecode = pending_video_frames.front().timecode;
300                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
301                 AudioFormat audio_format;
302                 audio_format.bits_per_sample = 24;
303                 audio_format.num_channels = 8;
304                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
305                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
306                                 video_timecode);
307                         QueuedFrame video_frame = pending_video_frames.front();
308                         pending_video_frames.pop_front();
309                         lock.unlock();
310                         video_frame_allocator->release_frame(video_frame.frame);
311                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
312                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
313                                 audio_timecode);
314                         QueuedFrame audio_frame = pending_audio_frames.front();
315                         pending_audio_frames.pop_front();
316                         lock.unlock();
317                         audio_format.id = audio_frame.format;
318
319                         // Use the video format of the pending frame.
320                         QueuedFrame video_frame = pending_video_frames.front();
321                         VideoFormat video_format;
322                         decode_video_format(video_frame.format, &video_format);
323
324                         frame_callback(audio_timecode,
325                                        FrameAllocator::Frame(), 0, video_format,
326                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
327                 } else {
328                         QueuedFrame video_frame = pending_video_frames.front();
329                         QueuedFrame audio_frame = pending_audio_frames.front();
330                         pending_audio_frames.pop_front();
331                         pending_video_frames.pop_front();
332                         lock.unlock();
333
334 #if 0
335                         char filename[255];
336                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
337                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
338                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
339 #endif
340
341                         VideoFormat video_format;
342                         audio_format.id = audio_frame.format;
343                         if (decode_video_format(video_frame.format, &video_format)) {
344                                 frame_callback(video_timecode,
345                                                video_frame.frame, HEADER_SIZE, video_format,
346                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
347                         } else {
348                                 frame_callback(video_timecode,
349                                                FrameAllocator::Frame(), 0, video_format,
350                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
351                         }
352                 }
353         }
354         if (has_dequeue_callbacks) {
355                 dequeue_cleanup_callback();
356         }
357 }
358
359 void BMUSBCapture::start_new_frame(const uint8_t *start)
360 {
361         uint16_t format = (start[3] << 8) | start[2];
362         uint16_t timecode = (start[1] << 8) | start[0];
363
364         if (current_video_frame.len > 0) {
365                 // If format is 0x0800 (no signal), add a fake (empty) audio
366                 // frame to get it out of the queue.
367                 // TODO: Figure out if there are other formats that come with
368                 // no audio, and treat them the same.
369                 if (format == 0x0800) {
370                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
371                         if (fake_audio_frame.data == nullptr) {
372                                 // Oh well, it's just a no-signal frame anyway.
373                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
374                                 current_video_frame.owner->release_frame(current_video_frame);
375                                 current_video_frame = video_frame_allocator->alloc_frame();
376                                 return;
377                         }
378                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
379                 }
380                 //dump_frame();
381                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
382
383                 // Update the assumed frame width. We might be one frame too late on format changes,
384                 // but it's much better than asking the user to choose manually.
385                 VideoFormat video_format;
386                 if (decode_video_format(format, &video_format)) {
387                         assumed_frame_width = video_format.width;
388                 }
389         }
390         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
391         //      format, timecode,
392         //      //start[7], start[6], start[5], start[4],
393         //      read_current_frame, FRAME_SIZE);
394
395         current_video_frame = video_frame_allocator->alloc_frame();
396         //if (current_video_frame.data == nullptr) {
397         //      read_current_frame = -1;
398         //} else {
399         //      read_current_frame = 0;
400         //}
401 }
402
403 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
404 {
405         uint16_t format = (start[3] << 8) | start[2];
406         uint16_t timecode = (start[1] << 8) | start[0];
407         if (current_audio_frame.len > 0) {
408                 //dump_audio_block();
409                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
410         }
411         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
412         //      format, timecode, read_current_audio_block);
413         current_audio_frame = audio_frame_allocator->alloc_frame();
414 }
415
416 #if 0
417 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
418 {
419         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
420         for (unsigned j = 0; j < pack->actual_length; j++) {
421         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
422                 printf("%02x", xfr->buffer[j + offset]);
423                 if ((j % 16) == 15)
424                         printf("\n");
425                 else if ((j % 8) == 7)
426                         printf("  ");
427                 else
428                         printf(" ");
429         }
430 }
431 #endif
432
// Deinterleaves "n" bytes from "src": bytes at even offsets go to "dest1",
// bytes at odd offsets go to "dest2" (so each destination receives n/2
// bytes). "n" must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
444
445 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
446 {
447         if (current_frame->data == nullptr ||
448             current_frame->len > current_frame->size ||
449             start == end) {
450                 return;
451         }
452
453         int bytes = end - start;
454         if (current_frame->len + bytes > current_frame->size) {
455                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
456                 current_frame->len = current_frame->size;
457                 if (current_frame->overflow > 1048576) {
458                         printf("%d bytes overflow after last %s frame\n",
459                                 int(current_frame->overflow), frame_type_name);
460                         current_frame->overflow = 0;
461                 }
462                 //dump_frame();
463         } else {
464                 if (current_frame->interleaved) {
465                         uint8_t *data = current_frame->data + current_frame->len / 2;
466                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
467                         if (current_frame->len % 2 == 1) {
468                                 ++data;
469                                 swap(data, data2);
470                         }
471                         if (bytes % 2 == 1) {
472                                 *data++ = *start++;
473                                 swap(data, data2);
474                                 ++current_frame->len;
475                                 --bytes;
476                         }
477                         memcpy_interleaved(data, data2, start, bytes);
478                         current_frame->len += bytes;
479                 } else {
480                         memcpy(current_frame->data + current_frame->len, start, bytes);
481                         current_frame->len += bytes;
482                 }
483         }
484 }
485
486 #if 0
487 void avx2_dump(const char *name, __m256i n)
488 {
489         printf("%-10s:", name);
490         printf(" %02x", _mm256_extract_epi8(n, 0));
491         printf(" %02x", _mm256_extract_epi8(n, 1));
492         printf(" %02x", _mm256_extract_epi8(n, 2));
493         printf(" %02x", _mm256_extract_epi8(n, 3));
494         printf(" %02x", _mm256_extract_epi8(n, 4));
495         printf(" %02x", _mm256_extract_epi8(n, 5));
496         printf(" %02x", _mm256_extract_epi8(n, 6));
497         printf(" %02x", _mm256_extract_epi8(n, 7));
498         printf(" ");
499         printf(" %02x", _mm256_extract_epi8(n, 8));
500         printf(" %02x", _mm256_extract_epi8(n, 9));
501         printf(" %02x", _mm256_extract_epi8(n, 10));
502         printf(" %02x", _mm256_extract_epi8(n, 11));
503         printf(" %02x", _mm256_extract_epi8(n, 12));
504         printf(" %02x", _mm256_extract_epi8(n, 13));
505         printf(" %02x", _mm256_extract_epi8(n, 14));
506         printf(" %02x", _mm256_extract_epi8(n, 15));
507         printf(" ");
508         printf(" %02x", _mm256_extract_epi8(n, 16));
509         printf(" %02x", _mm256_extract_epi8(n, 17));
510         printf(" %02x", _mm256_extract_epi8(n, 18));
511         printf(" %02x", _mm256_extract_epi8(n, 19));
512         printf(" %02x", _mm256_extract_epi8(n, 20));
513         printf(" %02x", _mm256_extract_epi8(n, 21));
514         printf(" %02x", _mm256_extract_epi8(n, 22));
515         printf(" %02x", _mm256_extract_epi8(n, 23));
516         printf(" ");
517         printf(" %02x", _mm256_extract_epi8(n, 24));
518         printf(" %02x", _mm256_extract_epi8(n, 25));
519         printf(" %02x", _mm256_extract_epi8(n, 26));
520         printf(" %02x", _mm256_extract_epi8(n, 27));
521         printf(" %02x", _mm256_extract_epi8(n, 28));
522         printf(" %02x", _mm256_extract_epi8(n, 29));
523         printf(" %02x", _mm256_extract_epi8(n, 30));
524         printf(" %02x", _mm256_extract_epi8(n, 31));
525         printf("\n");
526 }
527 #endif
528
529 #ifndef HAS_MULTIVERSIONING
530
531 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
532 {
533         // No fast path possible unless we have multiversioning.
534         return start;
535 }
536
537 #else  // defined(HAS_MULTIVERSIONING)
538
539 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
540
541 // Does a memcpy and memchr in one to reduce processing time.
542 // Note that the benefit is somewhat limited if your L3 cache is small,
543 // as you'll (unfortunately) spend most of the time loading the data
544 // from main memory.
545 //
546 // Complicated cases are left to the slow path; it basically stops copying
547 // up until the first instance of "sync_char" (usually a bit before, actually).
548 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
549 // data, and what we really need this for is the 00 00 ff ff marker in video data.
550 __attribute__((target("default")))
551 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
552 {
553         // No fast path possible unless we have SSE 4.1 or higher.
554         return start;
555 }
556
557 __attribute__((target("sse4.1", "avx2")))
558 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
559 {
560         if (current_frame->data == nullptr ||
561             current_frame->len > current_frame->size ||
562             start == limit) {
563                 return start;
564         }
565         size_t orig_bytes = limit - start;
566         if (orig_bytes < 128) {
567                 // Don't bother.
568                 return start;
569         }
570
571         // Don't read more bytes than we can write.
572         limit = min(limit, start + (current_frame->size - current_frame->len));
573
574         // Align end to 32 bytes.
575         limit = (const uint8_t *)(intptr_t(limit) & ~31);
576
577         if (start >= limit) {
578                 return start;
579         }
580
581         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
582         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
583         if (aligned_start != start) {
584                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
585                 if (sync_start == nullptr) {
586                         add_to_frame(current_frame, "", start, aligned_start);
587                 } else {
588                         add_to_frame(current_frame, "", start, sync_start);
589                         return sync_start;
590                 }
591         }
592
593         // Make the length a multiple of 64.
594         if (current_frame->interleaved) {
595                 if (((limit - aligned_start) % 64) != 0) {
596                         limit -= 32;
597                 }
598                 assert(((limit - aligned_start) % 64) == 0);
599         }
600
601         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
602 }
603
604 __attribute__((target("avx2")))
605 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
606 {
607         const __m256i needle = _mm256_set1_epi8(sync_char);
608
609         const __restrict __m256i *in = (const __m256i *)aligned_start;
610         if (current_frame->interleaved) {
611                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
612                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
613                 if (current_frame->len % 2 == 1) {
614                         swap(out1, out2);
615                 }
616
617                 __m256i shuffle_cw = _mm256_set_epi8(
618                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
619                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
620                 while (in < (const __m256i *)limit) {
621                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
622                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
623                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
624
625                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
626                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
627                         __m256i found = _mm256_or_si256(found1, found2);
628
629                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
630                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
631                 
632                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
633                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
634
635                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
636                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
637
638                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
639                         _mm256_storeu_si256(out2, hi);
640
641                         if (!_mm256_testz_si256(found, found)) {
642                                 break;
643                         }
644
645                         in += 2;
646                         ++out1;
647                         ++out2;
648                 }
649                 current_frame->len += (uint8_t *)in - aligned_start;
650         } else {
651                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
652                 while (in < (const __m256i *)limit) {
653                         __m256i data = _mm256_load_si256(in);
654                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
655                         __m256i found = _mm256_cmpeq_epi8(data, needle);
656                         if (!_mm256_testz_si256(found, found)) {
657                                 break;
658                         }
659
660                         ++in;
661                         ++out;
662                 }
663                 current_frame->len = (uint8_t *)out - current_frame->data;
664         }
665
666         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
667         return (const uint8_t *)in;
668 }
669
670 __attribute__((target("sse4.1")))
671 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
672 {
673         const __m128i needle = _mm_set1_epi8(sync_char);
674
675         const __m128i *in = (const __m128i *)aligned_start;
676         if (current_frame->interleaved) {
677                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
678                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
679                 if (current_frame->len % 2 == 1) {
680                         swap(out1, out2);
681                 }
682
683                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
684                 while (in < (const __m128i *)limit) {
685                         __m128i data1 = _mm_load_si128(in);
686                         __m128i data2 = _mm_load_si128(in + 1);
687                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
688                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
689                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
690                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
691                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
692                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
693                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
694                         _mm_storeu_si128(out2, hi);
695                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
696                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
697                         if (!_mm_testz_si128(found1, found1) ||
698                             !_mm_testz_si128(found2, found2)) {
699                                 break;
700                         }
701
702                         in += 2;
703                         ++out1;
704                         ++out2;
705                 }
706                 current_frame->len += (uint8_t *)in - aligned_start;
707         } else {
708                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
709                 while (in < (const __m128i *)limit) {
710                         __m128i data = _mm_load_si128(in);
711                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
712                         __m128i found = _mm_cmpeq_epi8(data, needle);
713                         if (!_mm_testz_si128(found, found)) {
714                                 break;
715                         }
716
717                         ++in;
718                         ++out;
719                 }
720                 current_frame->len = (uint8_t *)out - current_frame->data;
721         }
722
723         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
724         return (const uint8_t *)in;
725 }
726
727 #endif  // defined(HAS_MULTIVERSIONING)
728
729 void decode_packs(const libusb_transfer *xfr,
730                   const char *sync_pattern,
731                   int sync_length,
732                   FrameAllocator::Frame *current_frame,
733                   const char *frame_type_name,
734                   function<void(const uint8_t *start)> start_callback)
735 {
736         int offset = 0;
737         for (int i = 0; i < xfr->num_iso_packets; i++) {
738                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
739
740                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
741                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
742                         continue;
743 //exit(5);
744                 }
745
746                 const uint8_t *start = xfr->buffer + offset;
747                 const uint8_t *limit = start + pack->actual_length;
748                 while (start < limit) {  // Usually runs only one iteration.
749                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
750                         if (start == limit) break;
751                         assert(start < limit);
752
753                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
754                         if (start_next_frame == nullptr) {
755                                 // add the rest of the buffer
756                                 add_to_frame(current_frame, frame_type_name, start, limit);
757                                 break;
758                         } else {
759                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
760                                 start = start_next_frame + sync_length;  // skip sync
761                                 start_callback(start);
762                         }
763                 }
764 #if 0
765                 dump_pack(xfr, offset, pack);
766 #endif
767                 offset += pack->length;
768         }
769 }
770
771 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
772 {
773         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
774             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
775                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
776                 libusb_free_transfer(xfr);
777                 exit(3);
778         }
779
780         assert(xfr->user_data != nullptr);
781         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
782
783         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
784                 if (!usb->disconnected) {
785                         fprintf(stderr, "Device went away, stopping transfers.\n");
786                         usb->disconnected = true;
787                         if (usb->card_disconnected_callback) {
788                                 usb->card_disconnected_callback();
789                         }
790                 }
791                 // Don't reschedule the transfer; the loop will stop by itself.
792                 return;
793         }
794
795         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
796                 if (xfr->endpoint == 0x84) {
797                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
798                 } else {
799                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
800
801                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
802                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
803                 }
804         }
805         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
806                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
807                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
808 #if 0
809                 if (setup->wIndex == 44) {
810                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
811                 } else {
812                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
813                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
814                 }
815 #else
816                 memcpy(usb->register_file + usb->current_register, buf, 4);
817                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
818                 if (usb->current_register == 0) {
819                         // read through all of them
820                         printf("register dump:");
821                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
822                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
823                         }
824                         printf("\n");
825                 }
826                 libusb_fill_control_setup(xfr->buffer,
827                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
828                         /*index=*/usb->current_register, /*length=*/4);
829 #endif
830         }
831
832 #if 0
833         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
834         for (i = 0; i < xfr->actual_length; i++) {
835                 printf("%02x", xfr->buffer[i]);
836                 if (i % 16)
837                         printf("\n");
838                 else if (i % 8)
839                         printf("  ");
840                 else
841                         printf(" ");
842         }
843 #endif
844
845         int rc = libusb_submit_transfer(xfr);
846         if (rc < 0) {
847                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
848                 exit(1);
849         }
850 }
851
852 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
853 {
854         if (card_connected_callback != nullptr) {
855                 libusb_device_descriptor desc;
856                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
857                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
858                         libusb_unref_device(dev);
859                         return 1;
860                 }
861
862                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
863                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
864                         card_connected_callback(dev);  // Callback takes ownership.
865                         return 0;
866                 }
867         }
868         libusb_unref_device(dev);
869         return 0;
870 }
871
872 void BMUSBCapture::usb_thread_func()
873 {
874         sched_param param;
875         memset(&param, 0, sizeof(param));
876         param.sched_priority = 1;
877         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
878                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
879         }
880         while (!should_quit) {
881                 timeval sec { 1, 0 };
882                 int rc = libusb_handle_events_timeout(nullptr, &sec);
883                 if (rc != LIBUSB_SUCCESS)
884                         break;
885         }
886 }
887
888 namespace {
889
890 struct USBCardDevice {
891         uint16_t product;
892         uint8_t bus, port;
893         libusb_device *device;
894 };
895
// Maps a supported USB product ID to a human-readable card name.
// Unknown IDs trip an assertion (and yield nullptr if assertions are
// compiled out).
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
907
908 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
909 {
910         const char *product_name = get_product_name(product);
911
912         char buf[256];
913         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
914                 id, bus, port, product_name);
915         return buf;
916 }
917
918 vector<USBCardDevice> find_all_cards()
919 {
920         libusb_device **devices;
921         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
922         if (num_devices == -1) {
923                 fprintf(stderr, "Error finding USB devices\n");
924                 exit(1);
925         }
926         vector<USBCardDevice> found_cards;
927         for (ssize_t i = 0; i < num_devices; ++i) {
928                 libusb_device_descriptor desc;
929                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
930                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
931                         exit(1);
932                 }
933
934                 uint8_t bus = libusb_get_bus_number(devices[i]);
935                 uint8_t port = libusb_get_port_number(devices[i]);
936
937                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
938                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
939                         libusb_unref_device(devices[i]);
940                         continue;
941                 }
942
943                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
944         }
945         libusb_free_device_list(devices, 0);
946
947         // Sort the devices to get a consistent ordering.
948         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
949                 if (a.product != b.product)
950                         return a.product < b.product;
951                 if (a.bus != b.bus)
952                         return a.bus < b.bus;
953                 return a.port < b.port;
954         });
955
956         return found_cards;
957 }
958
959 libusb_device_handle *open_card(int card_index, string *description)
960 {
961         vector<USBCardDevice> found_cards = find_all_cards();
962
963         for (size_t i = 0; i < found_cards.size(); ++i) {
964                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
965                 fprintf(stderr, "%s\n", tmp_description.c_str());
966                 if (i == size_t(card_index)) {
967                         *description = tmp_description;
968                 }
969         }
970
971         if (size_t(card_index) >= found_cards.size()) {
972                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
973                 exit(1);
974         }
975
976         libusb_device_handle *devh;
977         int rc = libusb_open(found_cards[card_index].device, &devh);
978         if (rc < 0) {
979                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
980                 exit(1);
981         }
982
983         for (size_t i = 0; i < found_cards.size(); ++i) {
984                 libusb_unref_device(found_cards[i].device);
985         }
986
987         return devh;
988 }
989
990 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
991 {
992         uint8_t bus = libusb_get_bus_number(dev);
993         uint8_t port = libusb_get_port_number(dev);
994
995         libusb_device_descriptor desc;
996         if (libusb_get_device_descriptor(dev, &desc) < 0) {
997                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
998                 exit(1);
999         }
1000
1001         *description = get_card_description(card_index, bus, port, desc.idProduct);
1002
1003         libusb_device_handle *devh;
1004         int rc = libusb_open(dev, &devh);
1005         if (rc < 0) {
1006                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1007                 exit(1);
1008         }
1009
1010         return devh;
1011 }
1012
1013 }  // namespace
1014
1015 unsigned BMUSBCapture::num_cards()
1016 {
1017         int rc = libusb_init(nullptr);
1018         if (rc < 0) {
1019                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1020                 exit(1);
1021         }
1022
1023         vector<USBCardDevice> found_cards = find_all_cards();
1024         unsigned ret = found_cards.size();
1025         for (size_t i = 0; i < found_cards.size(); ++i) {
1026                 libusb_unref_device(found_cards[i].device);
1027         }
1028         return ret;
1029 }
1030
1031 void BMUSBCapture::configure_card()
1032 {
1033         if (video_frame_allocator == nullptr) {
1034                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1035                 set_video_frame_allocator(owned_video_frame_allocator.get());
1036         }
1037         if (audio_frame_allocator == nullptr) {
1038                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1039                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1040         }
1041         dequeue_thread_should_quit = false;
1042         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1043
1044         int rc;
1045         struct libusb_transfer *xfr;
1046
1047         rc = libusb_init(nullptr);
1048         if (rc < 0) {
1049                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1050                 exit(1);
1051         }
1052
1053         if (dev == nullptr) {
1054                 devh = open_card(card_index, &description);
1055         } else {
1056                 devh = open_card(card_index, dev, &description);
1057                 libusb_unref_device(dev);
1058         }
1059         if (!devh) {
1060                 fprintf(stderr, "Error finding USB device\n");
1061                 exit(1);
1062         }
1063
1064         libusb_config_descriptor *config;
1065         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1066         if (rc < 0) {
1067                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1068                 exit(1);
1069         }
1070
1071 #if 0
1072         printf("%d interface\n", config->bNumInterfaces);
1073         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1074                 printf("  interface %d\n", interface_number);
1075                 const libusb_interface *interface = &config->interface[interface_number];
1076                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1077                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1078                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1079                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1080                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1081                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1082                         }
1083                 }
1084         }
1085 #endif
1086
1087         rc = libusb_set_configuration(devh, /*configuration=*/1);
1088         if (rc < 0) {
1089                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1090                 exit(1);
1091         }
1092
1093         rc = libusb_claim_interface(devh, 0);
1094         if (rc < 0) {
1095                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1096                 exit(1);
1097         }
1098
1099         // Alternate setting 1 is output, alternate setting 2 is input.
1100         // Card is reset when switching alternates, so the driver uses
1101         // this “double switch” when it wants to reset.
1102         //
1103         // There's also alternate settings 3 and 4, which seem to be
1104         // like 1 and 2 except they advertise less bandwidth needed.
1105         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1106         if (rc < 0) {
1107                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1108                 exit(1);
1109         }
1110         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1111         if (rc < 0) {
1112                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1113                 exit(1);
1114         }
1115 #if 0
1116         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1117         if (rc < 0) {
1118                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1119                 exit(1);
1120         }
1121 #endif
1122
1123 #if 0
1124         rc = libusb_claim_interface(devh, 3);
1125         if (rc < 0) {
1126                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1127                 exit(1);
1128         }
1129 #endif
1130
1131         // theories:
1132         //   44 is some kind of timer register (first 16 bits count upwards)
1133         //   24 is some sort of watchdog?
1134         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1135         //      (or will go to 0x73c60010?), also seen 0x73c60100
1136         //   12 also changes all the time, unclear why  
1137         //   16 seems to be autodetected mode somehow
1138         //      --    this is e00115e0 after reset?
1139         //                    ed0115e0 after mode change [to output?]
1140         //                    2d0015e0 after more mode change [to input]
1141         //                    ed0115e0 after more mode change
1142         //                    2d0015e0 after more mode change
1143         //
1144         //                    390115e0 seems to indicate we have signal
1145         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1146         //
1147         //                    200015e0 on startup
1148         //         changes to 250115e0 when we sync to the signal
1149         //
1150         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1151         //
1152         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1153         //
1154         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1155         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1156         //
1157         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1158         //    perhaps some of them are related to analog output?
1159         //
1160         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1161         //    but the driver sets it to 0x8036802a at some point.
1162         //
1163         //    all of this is on request 214/215. other requests (192, 219,
1164         //    222, 223, 224) are used for firmware upgrade. Probably best to
1165         //    stay out of it unless you know what you're doing.
1166         //
1167         //
1168         // register 16:
1169         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1170         //
1171         // theories:
1172         //   0x01 - stable signal
1173         //   0x04 - deep color
1174         //   0x08 - unknown (audio??)
1175         //   0x20 - 720p??
1176         //   0x30 - 576p??
1177
1178         update_capture_mode();
1179
1180         struct ctrl {
1181                 int endpoint;
1182                 int request;
1183                 int index;
1184                 uint32_t data;
1185         };
1186         static const ctrl ctrls[] = {
1187                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1188                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1189
1190                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1191                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1192                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1193                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1194         };
1195
1196         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1197                 uint32_t flipped = htonl(ctrls[req].data);
1198                 static uint8_t value[4];
1199                 memcpy(value, &flipped, sizeof(flipped));
1200                 int size = sizeof(value);
1201                 //if (ctrls[req].request == 215) size = 0;
1202                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1203                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1204                 if (rc < 0) {
1205                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1206                         exit(1);
1207                 }
1208
1209                 if (ctrls[req].index == 16 && rc == 4) {
1210                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1211                 }
1212
1213 #if 0
1214                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1215                 for (int i = 0; i < rc; ++i) {
1216                         printf("%02x", value[i]);
1217                 }
1218                 printf("\n");
1219 #endif
1220         }
1221
1222 #if 0
1223         // DEBUG
1224         for ( ;; ) {
1225                 static int my_index = 0;
1226                 static uint8_t value[4];
1227                 int size = sizeof(value);
1228                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1229                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1230                 if (rc < 0) {
1231                         fprintf(stderr, "Error on control\n");
1232                         exit(1);
1233                 }
1234                 printf("rc=%d index=%d: 0x", rc, my_index);
1235                 for (int i = 0; i < rc; ++i) {
1236                         printf("%02x", value[i]);
1237                 }
1238                 printf("\n");
1239         }
1240 #endif
1241
1242 #if 0
1243         // set up an asynchronous transfer of the timer register
1244         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1245         static int completed = 0;
1246
1247         xfr = libusb_alloc_transfer(0);
1248         libusb_fill_control_setup(cmdbuf,
1249             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1250                 /*index=*/44, /*length=*/4);
1251         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1252         xfr->user_data = this;
1253         libusb_submit_transfer(xfr);
1254
1255         // set up an asynchronous transfer of register 24
1256         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1257         static int completed2 = 0;
1258
1259         xfr = libusb_alloc_transfer(0);
1260         libusb_fill_control_setup(cmdbuf2,
1261             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1262                 /*index=*/24, /*length=*/4);
1263         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1264         xfr->user_data = this;
1265         libusb_submit_transfer(xfr);
1266 #endif
1267
1268         // set up an asynchronous transfer of the register dump
1269         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1270         static int completed3 = 0;
1271
1272         xfr = libusb_alloc_transfer(0);
1273         libusb_fill_control_setup(cmdbuf3,
1274             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1275                 /*index=*/current_register, /*length=*/4);
1276         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1277         xfr->user_data = this;
1278         //libusb_submit_transfer(xfr);
1279
1280         //audiofp = fopen("audio.raw", "wb");
1281
1282         // set up isochronous transfers for audio and video
1283         for (int e = 3; e <= 4; ++e) {
1284                 //int num_transfers = (e == 3) ? 6 : 6;
1285                 int num_transfers = 6;
1286                 for (int i = 0; i < num_transfers; ++i) {
1287                         size_t buf_size;
1288                         int num_iso_pack, size;
1289                         if (e == 3) {
1290                                 // Allocate for minimum width (because that will give us the most
1291                                 // number of packets, so we don't need to reallocated, but we'll
1292                                 // default to 720p for the first frame.
1293                                 size = find_xfer_size_for_width(MIN_WIDTH);
1294                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1295                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1296                         } else {
1297                                 size = 0xc0;
1298                                 num_iso_pack = 80;
1299                                 buf_size = num_iso_pack * size;
1300                         }
1301                         int num_bytes = num_iso_pack * size;
1302                         assert(size_t(num_bytes) <= buf_size);
1303 #if LIBUSB_API_VERSION >= 0x01000105
1304                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1305 #else
1306                         uint8_t *buf = nullptr;
1307 #endif
1308                         if (buf == nullptr) {
1309                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1310 #if LIBUSB_API_VERSION >= 0x01000105
1311                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1312 #else
1313                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1314 #endif
1315                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1316                                 buf = new uint8_t[num_bytes];
1317                         }
1318
1319                         xfr = libusb_alloc_transfer(num_iso_pack);
1320                         if (!xfr) {
1321                                 fprintf(stderr, "oom\n");
1322                                 exit(1);
1323                         }
1324
1325                         int ep = LIBUSB_ENDPOINT_IN | e;
1326                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1327                                 num_iso_pack, cb_xfr, nullptr, 0);
1328                         libusb_set_iso_packet_lengths(xfr, size);
1329                         xfr->user_data = this;
1330
1331                         if (e == 3) {
1332                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1333                         }
1334
1335                         iso_xfrs.push_back(xfr);
1336                 }
1337         }
1338 }
1339
1340 void BMUSBCapture::start_bm_capture()
1341 {
1342         int i = 0;
1343         for (libusb_transfer *xfr : iso_xfrs) {
1344                 int rc = libusb_submit_transfer(xfr);
1345                 ++i;
1346                 if (rc < 0) {
1347                         //printf("num_bytes=%d\n", num_bytes);
1348                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1349                                 xfr->endpoint, i, libusb_error_name(rc));
1350                         exit(1);
1351                 }
1352         }
1353
1354
1355 #if 0
1356         libusb_release_interface(devh, 0);
1357 out:
1358         if (devh)
1359                 libusb_close(devh);
1360         libusb_exit(nullptr);
1361         return rc;
1362 #endif
1363 }
1364
1365 void BMUSBCapture::stop_dequeue_thread()
1366 {
1367         dequeue_thread_should_quit = true;
1368         queues_not_empty.notify_all();
1369         dequeue_thread.join();
1370 }
1371
1372 void BMUSBCapture::start_bm_thread()
1373 {
1374         // Devices leaving are discovered by seeing the isochronous packets
1375         // coming back with errors, so only care about devices joining.
1376         if (card_connected_callback != nullptr) {
1377                 if (libusb_hotplug_register_callback(
1378                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1379                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1380                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1381                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1382                         exit(1);
1383                 }
1384         }
1385
1386         should_quit = false;
1387         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1388 }
1389
1390 void BMUSBCapture::stop_bm_thread()
1391 {
1392         should_quit = true;
1393         usb_thread.join();
1394 }
1395
1396 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1397 {
1398         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1399         VideoMode auto_mode;
1400         auto_mode.name = "Autodetect";
1401         auto_mode.autodetect = true;
1402         return {{ 0, auto_mode }};
1403 }
1404
1405 uint32_t BMUSBCapture::get_current_video_mode() const
1406 {
1407         return 0;  // Matches get_available_video_modes().
1408 }
1409
1410 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1411 {
1412         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1413 }
1414
1415 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1416 {
1417         return {
1418                 { 0x00000000, "HDMI/SDI" },
1419                 { 0x02000000, "Component" },
1420                 { 0x04000000, "Composite" },
1421                 { 0x06000000, "S-video" }
1422         };
1423 }
1424
1425 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1426 {
1427         assert((video_input_id & ~0x06000000) == 0);
1428         current_video_input = video_input_id;
1429         update_capture_mode();
1430 }
1431
1432 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1433 {
1434         return {
1435                 { 0x00000000, "Embedded" },
1436                 { 0x10000000, "Analog" }
1437         };
1438 }
1439
1440 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1441 {
1442         assert((audio_input_id & ~0x10000000) == 0);
1443         current_audio_input = audio_input_id;
1444         update_capture_mode();
1445 }
1446
1447 void BMUSBCapture::update_capture_mode()
1448 {
1449         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1450         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1451         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1452
1453         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1454                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1455         if (rc < 0) {
1456                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1457                 exit(1);
1458         }
1459 }
1460
1461 }  // namespace bmusb