]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Release v0.4, and add a pkg-config file.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.4
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <condition_variable>
29 #include <cstddef>
30 #include <cstdint>
31 #include <deque>
32 #include <functional>
33 #include <memory>
34 #include <mutex>
35 #include <stack>
36 #include <string>
37 #include <thread>
38
39 using namespace std;
40 using namespace std::placeholders;
41
42 #define USB_VENDOR_BLACKMAGIC 0x1edb
43 #define MIN_WIDTH 640
44 #define HEADER_SIZE 44
45 //#define HEADER_SIZE 0
46 #define AUDIO_HEADER_SIZE 4
47
48 #define FRAME_SIZE (8 << 20)  // 8 MB.
49 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
50
51 namespace bmusb {
52
53 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
54
55 namespace {
56
57 FILE *audiofp;
58
59 thread usb_thread;
60 atomic<bool> should_quit;
61
// Returns the isochronous packet size (in bytes) to use for a frame of the
// given width.
int find_xfer_size_for_width(int width)
{
        // Isochronous packets for video appear to need a size proportional to
        // the frame width; about six lines' worth (at two bytes per pixel)
        // seems right, padded up to the required 1 kB multiple.
        //
        // Note that 10-bit input would need a correspondingly larger size
        // (e.g. scaled by 4/3), which is not done here.
        const int packet_alignment = 1024;
        const int raw_size = width * 2 * 6;
        if (raw_size % packet_alignment == 0) {
                return raw_size;
        }
        return (raw_size & ~(packet_alignment - 1)) + packet_alignment;
}
76
77 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
78 {
79         assert(width >= MIN_WIDTH);
80         size_t size = find_xfer_size_for_width(width);
81         int num_iso_pack = xfr->length / size;
82         if (num_iso_pack != xfr->num_iso_packets ||
83             size != xfr->iso_packet_desc[0].length) {
84                 xfr->num_iso_packets = num_iso_pack;
85                 libusb_set_iso_packet_lengths(xfr, size);
86         }
87 }
88
// One row of the table decode_video_format() uses to map a normalized format
// code to frame geometry and frame rate. Kept as a plain aggregate so that
// rows can be brace-initialized positionally; the member order matters.
struct VideoFormatEntry {
        // Format code to match, after the caller has masked off flag bits.
        uint16_t normalized_video_format;

        // Picture dimensions, and the value copied into second_field_start
        // (zero in every progressive row of the table).
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Values copied into extra_lines_top / extra_lines_bottom.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as the fraction frame_rate_nom / frame_rate_den.
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
96
97 // Get details for the given video format; returns false if detection was incomplete.
98 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
99 {
100         decoded_video_format->id = video_format;
101         decoded_video_format->interlaced = false;
102
103         // TODO: Add these for all formats as we find them.
104         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
105
106         if (video_format == 0x0800) {
107                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
108                 // It's a strange thing, but what can you do.
109                 decoded_video_format->width = 720;
110                 decoded_video_format->height = 525;
111                 decoded_video_format->extra_lines_top = 0;
112                 decoded_video_format->extra_lines_bottom = 0;
113                 decoded_video_format->frame_rate_nom = 3013;
114                 decoded_video_format->frame_rate_den = 100;
115                 decoded_video_format->has_signal = false;
116                 return true;
117         }
118         if ((video_format & 0xe800) != 0xe800) {
119                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
120                         video_format);
121                 decoded_video_format->width = 0;
122                 decoded_video_format->height = 0;
123                 decoded_video_format->extra_lines_top = 0;
124                 decoded_video_format->extra_lines_bottom = 0;
125                 decoded_video_format->frame_rate_nom = 60;
126                 decoded_video_format->frame_rate_den = 1;
127                 decoded_video_format->has_signal = false;
128                 return false;
129         }
130
131         decoded_video_format->has_signal = true;
132
133         // NTSC (480i59.94, I suppose). A special case, see below.
134         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
135                 decoded_video_format->width = 720;
136                 decoded_video_format->height = 480;
137                 decoded_video_format->extra_lines_top = 17;
138                 decoded_video_format->extra_lines_bottom = 28;
139                 decoded_video_format->frame_rate_nom = 30000;
140                 decoded_video_format->frame_rate_den = 1001;
141                 decoded_video_format->second_field_start = 280;
142                 decoded_video_format->interlaced = true;
143                 return true;
144         }
145
146         // PAL (576i50, I suppose). A special case, see below.
147         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
148                 decoded_video_format->width = 720;
149                 decoded_video_format->height = 576;
150                 decoded_video_format->extra_lines_top = 22;
151                 decoded_video_format->extra_lines_bottom = 27;
152                 decoded_video_format->frame_rate_nom = 25;
153                 decoded_video_format->frame_rate_den = 1;
154                 decoded_video_format->second_field_start = 335;
155                 decoded_video_format->interlaced = true;
156                 return true;
157         }
158
159         // 0x8 seems to be a flag about availability of deep color on the input,
160         // except when it's not (e.g. it's the only difference between NTSC
161         // and PAL). Rather confusing. But we clear it here nevertheless, because
162         // usually it doesn't mean anything.
163         //
164         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
165         uint16_t normalized_video_format = video_format & ~0xe80c;
166         constexpr VideoFormatEntry entries[] = {
167                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
168                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
169                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
170                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
171                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
172                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
173                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
174                 { 0x01c3, 1920, 1080,   0,  0,  0,    30,    1, false },  // 1080p30.
175                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
176                 { 0x01e1, 1920, 1080,   0,  0,  0, 30000, 1001, false },  // 1080p29.97.
177                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
178                 { 0x0063, 1920, 1080,   0,  0,  0,    25,    1, false },  // 1080p25.
179                 { 0x0043, 1920, 1080,   0,  0,  0,    25,    1,  true },  // 1080p50.
180                 { 0x008e, 1920, 1080,   0,  0,  0,    24,    1, false },  // 1080p24.
181                 { 0x00a1, 1920, 1080,   0,  0,  0, 24000, 1001, false },  // 1080p23.98.
182         };
183         for (const VideoFormatEntry &entry : entries) {
184                 if (normalized_video_format == entry.normalized_video_format) {
185                         decoded_video_format->width = entry.width;
186                         decoded_video_format->height = entry.height;
187                         decoded_video_format->second_field_start = entry.second_field_start;
188                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
189                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
190                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
191                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
192                         decoded_video_format->interlaced = entry.interlaced;
193                         return true;
194                 }
195         }
196
197         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
198         decoded_video_format->width = 1280;
199         decoded_video_format->height = 720;
200         decoded_video_format->frame_rate_nom = 60;
201         decoded_video_format->frame_rate_den = 1;
202         return false;
203 }
204
205 }  // namespace
206
207 FrameAllocator::~FrameAllocator() {}
208
209 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
210         : frame_size(frame_size)
211 {
212         for (size_t i = 0; i < num_queued_frames; ++i) {
213                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
214         }
215 }
216
217 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
218 {
219         Frame vf;
220         vf.owner = this;
221
222         unique_lock<mutex> lock(freelist_mutex);  // Meh.
223         if (freelist.empty()) {
224                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
225                         frame_size);
226         } else {
227                 vf.data = freelist.top().release();
228                 vf.size = frame_size;
229                 freelist.pop();  // Meh.
230         }
231         return vf;
232 }
233
234 void MallocFrameAllocator::release_frame(Frame frame)
235 {
236         if (frame.overflow > 0) {
237                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
238         }
239         unique_lock<mutex> lock(freelist_mutex);
240         freelist.push(unique_ptr<uint8_t[]>(frame.data));
241 }
242
// Returns true if `a` comes before `b` when both are treated as positions on
// a 16-bit counter that wraps around: that is, if stepping forward from `a`
// reaches `b` in at least one and fewer than 0x8000 steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
254
255 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
256 {
257         unique_lock<mutex> lock(queue_lock);
258         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
259                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
260                         q->back().timecode, timecode);
261                 frame.owner->release_frame(frame);
262                 return;
263         }
264
265         QueuedFrame qf;
266         qf.format = format;
267         qf.timecode = timecode;
268         qf.frame = frame;
269         q->push_back(move(qf));
270         queues_not_empty.notify_one();  // might be spurious
271 }
272
273 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
274 {
275         FILE *fp = fopen(filename, "wb");
276         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
277                 printf("short write!\n");
278         }
279         fclose(fp);
280 }
281
282 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
283 {
284         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
285 }
286
287 void BMUSBCapture::dequeue_thread_func()
288 {
289         if (has_dequeue_callbacks) {
290                 dequeue_init_callback();
291         }
292         while (!dequeue_thread_should_quit) {
293                 unique_lock<mutex> lock(queue_lock);
294                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
295
296                 if (dequeue_thread_should_quit) break;
297
298                 uint16_t video_timecode = pending_video_frames.front().timecode;
299                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
300                 AudioFormat audio_format;
301                 audio_format.bits_per_sample = 24;
302                 audio_format.num_channels = 8;
303                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
304                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
305                                 video_timecode);
306                         QueuedFrame video_frame = pending_video_frames.front();
307                         pending_video_frames.pop_front();
308                         lock.unlock();
309                         video_frame_allocator->release_frame(video_frame.frame);
310                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
311                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
312                                 audio_timecode);
313                         QueuedFrame audio_frame = pending_audio_frames.front();
314                         pending_audio_frames.pop_front();
315                         lock.unlock();
316                         audio_format.id = audio_frame.format;
317
318                         // Use the video format of the pending frame.
319                         QueuedFrame video_frame = pending_video_frames.front();
320                         VideoFormat video_format;
321                         decode_video_format(video_frame.format, &video_format);
322
323                         frame_callback(audio_timecode,
324                                        FrameAllocator::Frame(), 0, video_format,
325                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
326                 } else {
327                         QueuedFrame video_frame = pending_video_frames.front();
328                         QueuedFrame audio_frame = pending_audio_frames.front();
329                         pending_audio_frames.pop_front();
330                         pending_video_frames.pop_front();
331                         lock.unlock();
332
333 #if 0
334                         char filename[255];
335                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
336                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
337                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
338 #endif
339
340                         VideoFormat video_format;
341                         audio_format.id = audio_frame.format;
342                         if (decode_video_format(video_frame.format, &video_format)) {
343                                 frame_callback(video_timecode,
344                                                video_frame.frame, HEADER_SIZE, video_format,
345                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
346                         } else {
347                                 frame_callback(video_timecode,
348                                                FrameAllocator::Frame(), 0, video_format,
349                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
350                         }
351                 }
352         }
353         if (has_dequeue_callbacks) {
354                 dequeue_cleanup_callback();
355         }
356 }
357
358 void BMUSBCapture::start_new_frame(const uint8_t *start)
359 {
360         uint16_t format = (start[3] << 8) | start[2];
361         uint16_t timecode = (start[1] << 8) | start[0];
362
363         if (current_video_frame.len > 0) {
364                 // If format is 0x0800 (no signal), add a fake (empty) audio
365                 // frame to get it out of the queue.
366                 // TODO: Figure out if there are other formats that come with
367                 // no audio, and treat them the same.
368                 if (format == 0x0800) {
369                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
370                         if (fake_audio_frame.data == nullptr) {
371                                 // Oh well, it's just a no-signal frame anyway.
372                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
373                                 current_video_frame.owner->release_frame(current_video_frame);
374                                 current_video_frame = video_frame_allocator->alloc_frame();
375                                 return;
376                         }
377                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
378                 }
379                 //dump_frame();
380                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
381
382                 // Update the assumed frame width. We might be one frame too late on format changes,
383                 // but it's much better than asking the user to choose manually.
384                 VideoFormat video_format;
385                 if (decode_video_format(format, &video_format)) {
386                         assumed_frame_width = video_format.width;
387                 }
388         }
389         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
390         //      format, timecode,
391         //      //start[7], start[6], start[5], start[4],
392         //      read_current_frame, FRAME_SIZE);
393
394         current_video_frame = video_frame_allocator->alloc_frame();
395         //if (current_video_frame.data == nullptr) {
396         //      read_current_frame = -1;
397         //} else {
398         //      read_current_frame = 0;
399         //}
400 }
401
402 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
403 {
404         uint16_t format = (start[3] << 8) | start[2];
405         uint16_t timecode = (start[1] << 8) | start[0];
406         if (current_audio_frame.len > 0) {
407                 //dump_audio_block();
408                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
409         }
410         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
411         //      format, timecode, read_current_audio_block);
412         current_audio_frame = audio_frame_allocator->alloc_frame();
413 }
414
#if 0
// Debug helper (compiled out): hex-dumps the bytes actually received for one
// isochronous packet, starting `offset` bytes into the transfer's buffer.
// Prints sixteen bytes per line, with a wider gap after the eighth.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", xfr->buffer[pos + offset]);

                const char *separator = " ";
                if (pos % 16 == 15) {
                        separator = "\n";
                } else if (pos % 8 == 7) {
                        separator = "  ";
                }
                printf("%s", separator);
        }
}
#endif
431
// De-interleaves `n` bytes from `src`: bytes at even offsets go to `dest1`
// and bytes at odd offsets go to `dest2`, each packed contiguously.
// `n` must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const uint8_t *const src_end = src + n;
        for (size_t pair = 0; src < src_end; ++pair, src += 2) {
                dest1[pair] = src[0];
                dest2[pair] = src[1];
        }
}
443
444 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
445 {
446         if (current_frame->data == nullptr ||
447             current_frame->len > current_frame->size ||
448             start == end) {
449                 return;
450         }
451
452         int bytes = end - start;
453         if (current_frame->len + bytes > current_frame->size) {
454                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
455                 current_frame->len = current_frame->size;
456                 if (current_frame->overflow > 1048576) {
457                         printf("%d bytes overflow after last %s frame\n",
458                                 int(current_frame->overflow), frame_type_name);
459                         current_frame->overflow = 0;
460                 }
461                 //dump_frame();
462         } else {
463                 if (current_frame->interleaved) {
464                         uint8_t *data = current_frame->data + current_frame->len / 2;
465                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
466                         if (current_frame->len % 2 == 1) {
467                                 ++data;
468                                 swap(data, data2);
469                         }
470                         if (bytes % 2 == 1) {
471                                 *data++ = *start++;
472                                 swap(data, data2);
473                                 ++current_frame->len;
474                                 --bytes;
475                         }
476                         memcpy_interleaved(data, data2, start, bytes);
477                         current_frame->len += bytes;
478                 } else {
479                         memcpy(current_frame->data + current_frame->len, start, bytes);
480                         current_frame->len += bytes;
481                 }
482         }
483 }
484
#if 0
// Debug helper (compiled out): prints `name` followed by the 32 bytes of `n`
// in hex, lowest byte first, in four groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t lanes[32];
        _mm256_storeu_si256((__m256i *)lanes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", lanes[i]);
        }
        printf("\n");
}
#endif
527
528 #ifndef HAS_MULTIVERSIONING
529
530 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
531 {
532         // No fast path possible unless we have multiversioning.
533         return start;
534 }
535
536 #else  // defined(HAS_MULTIVERSIONING)
537
538 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
539
540 // Does a memcpy and memchr in one to reduce processing time.
541 // Note that the benefit is somewhat limited if your L3 cache is small,
542 // as you'll (unfortunately) spend most of the time loading the data
543 // from main memory.
544 //
545 // Complicated cases are left to the slow path; it basically stops copying
546 // up until the first instance of "sync_char" (usually a bit before, actually).
547 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
548 // data, and what we really need this for is the 00 00 ff ff marker in video data.
549 __attribute__((target("default")))
550 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
551 {
552         // No fast path possible unless we have SSE 4.1 or higher.
553         return start;
554 }
555
556 __attribute__((target("sse4.1", "avx2")))
557 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
558 {
559         if (current_frame->data == nullptr ||
560             current_frame->len > current_frame->size ||
561             start == limit) {
562                 return start;
563         }
564         size_t orig_bytes = limit - start;
565         if (orig_bytes < 128) {
566                 // Don't bother.
567                 return start;
568         }
569
570         // Don't read more bytes than we can write.
571         limit = min(limit, start + (current_frame->size - current_frame->len));
572
573         // Align end to 32 bytes.
574         limit = (const uint8_t *)(intptr_t(limit) & ~31);
575
576         if (start >= limit) {
577                 return start;
578         }
579
580         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
581         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
582         if (aligned_start != start) {
583                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
584                 if (sync_start == nullptr) {
585                         add_to_frame(current_frame, "", start, aligned_start);
586                 } else {
587                         add_to_frame(current_frame, "", start, sync_start);
588                         return sync_start;
589                 }
590         }
591
592         // Make the length a multiple of 64.
593         if (current_frame->interleaved) {
594                 if (((limit - aligned_start) % 64) != 0) {
595                         limit -= 32;
596                 }
597                 assert(((limit - aligned_start) % 64) == 0);
598         }
599
600         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
601 }
602
// AVX2 version of the fast-path core: copies 32-byte vectors from
// [aligned_start, limit) into current_frame while looking for sync_char.
//
// Copying stops at the first vector (in interleaved mode: the first pair of
// vectors) that contains sync_char. That vector is still stored to the frame,
// but neither the input pointer nor the frame length is advanced past it, so
// the caller's slow path processes it again from the returned position.
//
// Returns a pointer to the first input byte that was not consumed.
//
// aligned_start is loaded from with aligned loads, so it needs 32-byte
// alignment. In interleaved mode each round reads two vectors, so
// limit - aligned_start needs to be a multiple of 64.
603 __attribute__((target("avx2")))
604 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
605 {
        // sync_char broadcast to all 32 byte lanes, for the equality test.
606         const __m256i needle = _mm256_set1_epi8(sync_char);
607
608         const __restrict __m256i *in = (const __m256i *)aligned_start;
609         if (current_frame->interleaved) {
                // out1 receives the even-offset input bytes and out2 the odd-offset
                // ones. With an odd length so far, data is one byte ahead of data2
                // (hence the rounding up) and the roles of the planes are swapped.
610                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
611                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
612                 if (current_frame->len % 2 == 1) {
613                         swap(out1, out2);
614                 }
615
                // Shuffle control word: within each 128-bit lane, gathers the
                // even-indexed bytes into the low half and the odd-indexed bytes
                // into the high half.
616                 __m256i shuffle_cw = _mm256_set_epi8(
617                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
618                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
619                 while (in < (const __m256i *)limit) {
620                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
621                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
622                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
623
                        // Nonzero in every byte position (of either vector) that equals sync_char.
624                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
625                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
626                         __m256i found = _mm256_or_si256(found1, found2);
627
628                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
629                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
630                 
631                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
632                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
633
                        // lo = low 128 bits of data1 and data2 (the even-offset bytes),
                        // hi = high 128 bits of data1 and data2 (the odd-offset bytes).
634                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
635                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
636
637                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
638                         _mm256_storeu_si256(out2, hi);
639
                        // sync_char seen: leave without advancing, so these 64 bytes
                        // do not count as consumed.
640                         if (!_mm256_testz_si256(found, found)) {
641                                 break;
642                         }
643
644                         in += 2;
645                         ++out1;
646                         ++out2;
647                 }
                // Number of input bytes consumed (split evenly between the two planes).
648                 current_frame->len += (uint8_t *)in - aligned_start;
649         } else {
650                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
651                 while (in < (const __m256i *)limit) {
652                         __m256i data = _mm256_load_si256(in);
653                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
654                         __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // sync_char seen: leave without advancing past this vector.
655                         if (!_mm256_testz_si256(found, found)) {
656                                 break;
657                         }
658
659                         ++in;
660                         ++out;
661                 }
                // out only advanced for fully accepted vectors, so this is the new length.
662                 current_frame->len = (uint8_t *)out - current_frame->data;
663         }
664
665         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
666         return (const uint8_t *)in;
667 }
668
669 __attribute__((target("sse4.1")))
670 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
671 {
672         const __m128i needle = _mm_set1_epi8(sync_char);
673
674         const __m128i *in = (const __m128i *)aligned_start;
675         if (current_frame->interleaved) {
676                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
677                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
678                 if (current_frame->len % 2 == 1) {
679                         swap(out1, out2);
680                 }
681
682                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
683                 while (in < (const __m128i *)limit) {
684                         __m128i data1 = _mm_load_si128(in);
685                         __m128i data2 = _mm_load_si128(in + 1);
686                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
687                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
688                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
689                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
690                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
691                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
692                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
693                         _mm_storeu_si128(out2, hi);
694                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
695                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
696                         if (!_mm_testz_si128(found1, found1) ||
697                             !_mm_testz_si128(found2, found2)) {
698                                 break;
699                         }
700
701                         in += 2;
702                         ++out1;
703                         ++out2;
704                 }
705                 current_frame->len += (uint8_t *)in - aligned_start;
706         } else {
707                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
708                 while (in < (const __m128i *)limit) {
709                         __m128i data = _mm_load_si128(in);
710                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
711                         __m128i found = _mm_cmpeq_epi8(data, needle);
712                         if (!_mm_testz_si128(found, found)) {
713                                 break;
714                         }
715
716                         ++in;
717                         ++out;
718                 }
719                 current_frame->len = (uint8_t *)out - current_frame->data;
720         }
721
722         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
723         return (const uint8_t *)in;
724 }
725
726 #endif  // defined(HAS_MULTIVERSIONING)
727
728 void decode_packs(const libusb_transfer *xfr,
729                   const char *sync_pattern,
730                   int sync_length,
731                   FrameAllocator::Frame *current_frame,
732                   const char *frame_type_name,
733                   function<void(const uint8_t *start)> start_callback)
734 {
735         int offset = 0;
736         for (int i = 0; i < xfr->num_iso_packets; i++) {
737                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
738
739                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
740                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
741                         continue;
742 //exit(5);
743                 }
744
745                 const uint8_t *start = xfr->buffer + offset;
746                 const uint8_t *limit = start + pack->actual_length;
747                 while (start < limit) {  // Usually runs only one iteration.
748                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
749                         if (start == limit) break;
750                         assert(start < limit);
751
752                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
753                         if (start_next_frame == nullptr) {
754                                 // add the rest of the buffer
755                                 add_to_frame(current_frame, frame_type_name, start, limit);
756                                 break;
757                         } else {
758                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
759                                 start = start_next_frame + sync_length;  // skip sync
760                                 start_callback(start);
761                         }
762                 }
763 #if 0
764                 dump_pack(xfr, offset, pack);
765 #endif
766                 offset += pack->length;
767         }
768 }
769
770 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
771 {
772         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
773             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
774                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
775                 libusb_free_transfer(xfr);
776                 exit(3);
777         }
778
779         assert(xfr->user_data != nullptr);
780         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
781
782         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
783                 if (!usb->disconnected) {
784                         fprintf(stderr, "Device went away, stopping transfers.\n");
785                         usb->disconnected = true;
786                         if (usb->card_disconnected_callback) {
787                                 usb->card_disconnected_callback();
788                         }
789                 }
790                 // Don't reschedule the transfer; the loop will stop by itself.
791                 return;
792         }
793
794         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
795                 if (xfr->endpoint == 0x84) {
796                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
797                 } else {
798                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
799
800                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
801                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
802                 }
803         }
804         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
805                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
806                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
807 #if 0
808                 if (setup->wIndex == 44) {
809                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
810                 } else {
811                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
812                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
813                 }
814 #else
815                 memcpy(usb->register_file + usb->current_register, buf, 4);
816                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
817                 if (usb->current_register == 0) {
818                         // read through all of them
819                         printf("register dump:");
820                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
821                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
822                         }
823                         printf("\n");
824                 }
825                 libusb_fill_control_setup(xfr->buffer,
826                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
827                         /*index=*/usb->current_register, /*length=*/4);
828 #endif
829         }
830
831 #if 0
832         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
833         for (i = 0; i < xfr->actual_length; i++) {
834                 printf("%02x", xfr->buffer[i]);
835                 if (i % 16)
836                         printf("\n");
837                 else if (i % 8)
838                         printf("  ");
839                 else
840                         printf(" ");
841         }
842 #endif
843
844         int rc = libusb_submit_transfer(xfr);
845         if (rc < 0) {
846                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
847                 exit(1);
848         }
849 }
850
851 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
852 {
853         if (card_connected_callback != nullptr) {
854                 libusb_device_descriptor desc;
855                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
856                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
857                         libusb_unref_device(dev);
858                         return 1;
859                 }
860
861                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
862                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
863                         card_connected_callback(dev);  // Callback takes ownership.
864                         return 0;
865                 }
866         }
867         libusb_unref_device(dev);
868         return 0;
869 }
870
871 void BMUSBCapture::usb_thread_func()
872 {
873         sched_param param;
874         memset(&param, 0, sizeof(param));
875         param.sched_priority = 1;
876         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
877                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
878         }
879         while (!should_quit) {
880                 int rc = libusb_handle_events(nullptr);
881                 if (rc != LIBUSB_SUCCESS)
882                         break;
883         }
884 }
885
886 struct USBCardDevice {
887         uint16_t product;
888         uint8_t bus, port;
889         libusb_device *device;
890 };
891
// Maps a supported USB product ID to a human-readable product name.
// Unknown IDs are a programming error: they trip an assertion, and nullptr
// is returned if assertions are compiled out.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
903
904 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
905 {
906         const char *product_name = get_product_name(product);
907
908         char buf[256];
909         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
910                 id, bus, port, product_name);
911         return buf;
912 }
913
914 libusb_device_handle *open_card(int card_index, string *description)
915 {
916         libusb_device **devices;
917         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
918         if (num_devices == -1) {
919                 fprintf(stderr, "Error finding USB devices\n");
920                 exit(1);
921         }
922         vector<USBCardDevice> found_cards;
923         for (ssize_t i = 0; i < num_devices; ++i) {
924                 libusb_device_descriptor desc;
925                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
926                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
927                         exit(1);
928                 }
929
930                 uint8_t bus = libusb_get_bus_number(devices[i]);
931                 uint8_t port = libusb_get_port_number(devices[i]);
932
933                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
934                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
935                         libusb_unref_device(devices[i]);
936                         continue;
937                 }
938
939                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
940         }
941         libusb_free_device_list(devices, 0);
942
943         // Sort the devices to get a consistent ordering.
944         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
945                 if (a.product != b.product)
946                         return a.product < b.product;
947                 if (a.bus != b.bus)
948                         return a.bus < b.bus;
949                 return a.port < b.port;
950         });
951
952         for (size_t i = 0; i < found_cards.size(); ++i) {
953                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
954                 fprintf(stderr, "%s\n", tmp_description.c_str());
955                 if (i == size_t(card_index)) {
956                         *description = tmp_description;
957                 }
958         }
959
960         if (size_t(card_index) >= found_cards.size()) {
961                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
962                 exit(1);
963         }
964
965         libusb_device_handle *devh;
966         int rc = libusb_open(found_cards[card_index].device, &devh);
967         if (rc < 0) {
968                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
969                 exit(1);
970         }
971
972         for (size_t i = 0; i < found_cards.size(); ++i) {
973                 libusb_unref_device(found_cards[i].device);
974         }
975
976         return devh;
977 }
978
979 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
980 {
981         uint8_t bus = libusb_get_bus_number(dev);
982         uint8_t port = libusb_get_port_number(dev);
983
984         libusb_device_descriptor desc;
985         if (libusb_get_device_descriptor(dev, &desc) < 0) {
986                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
987                 exit(1);
988         }
989
990         *description = get_card_description(card_index, bus, port, desc.idProduct);
991
992         libusb_device_handle *devh;
993         int rc = libusb_open(dev, &devh);
994         if (rc < 0) {
995                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
996                 exit(1);
997         }
998
999         return devh;
1000 }
1001
1002 void BMUSBCapture::configure_card()
1003 {
1004         if (video_frame_allocator == nullptr) {
1005                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1006                 set_video_frame_allocator(owned_video_frame_allocator.get());
1007         }
1008         if (audio_frame_allocator == nullptr) {
1009                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1010                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1011         }
1012         dequeue_thread_should_quit = false;
1013         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1014
1015         int rc;
1016         struct libusb_transfer *xfr;
1017
1018         rc = libusb_init(nullptr);
1019         if (rc < 0) {
1020                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1021                 exit(1);
1022         }
1023
1024         if (dev == nullptr) {
1025                 devh = open_card(card_index, &description);
1026         } else {
1027                 devh = open_card(card_index, dev, &description);
1028                 libusb_unref_device(dev);
1029         }
1030         if (!devh) {
1031                 fprintf(stderr, "Error finding USB device\n");
1032                 exit(1);
1033         }
1034
1035         libusb_config_descriptor *config;
1036         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1037         if (rc < 0) {
1038                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1039                 exit(1);
1040         }
1041
1042 #if 0
1043         printf("%d interface\n", config->bNumInterfaces);
1044         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1045                 printf("  interface %d\n", interface_number);
1046                 const libusb_interface *interface = &config->interface[interface_number];
1047                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1048                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1049                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1050                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1051                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1052                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1053                         }
1054                 }
1055         }
1056 #endif
1057
1058         rc = libusb_set_configuration(devh, /*configuration=*/1);
1059         if (rc < 0) {
1060                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1061                 exit(1);
1062         }
1063
1064         rc = libusb_claim_interface(devh, 0);
1065         if (rc < 0) {
1066                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1067                 exit(1);
1068         }
1069
1070         // Alternate setting 1 is output, alternate setting 2 is input.
1071         // Card is reset when switching alternates, so the driver uses
1072         // this “double switch” when it wants to reset.
1073         //
1074         // There's also alternate settings 3 and 4, which seem to be
1075         // like 1 and 2 except they advertise less bandwidth needed.
1076         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1077         if (rc < 0) {
1078                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1079                 exit(1);
1080         }
1081         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1082         if (rc < 0) {
1083                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1084                 exit(1);
1085         }
1086 #if 0
1087         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1088         if (rc < 0) {
1089                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1090                 exit(1);
1091         }
1092 #endif
1093
1094 #if 0
1095         rc = libusb_claim_interface(devh, 3);
1096         if (rc < 0) {
1097                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1098                 exit(1);
1099         }
1100 #endif
1101
1102         // theories:
1103         //   44 is some kind of timer register (first 16 bits count upwards)
1104         //   24 is some sort of watchdog?
1105         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1106         //      (or will go to 0x73c60010?), also seen 0x73c60100
1107         //   12 also changes all the time, unclear why  
1108         //   16 seems to be autodetected mode somehow
1109         //      --    this is e00115e0 after reset?
1110         //                    ed0115e0 after mode change [to output?]
1111         //                    2d0015e0 after more mode change [to input]
1112         //                    ed0115e0 after more mode change
1113         //                    2d0015e0 after more mode change
1114         //
1115         //                    390115e0 seems to indicate we have signal
1116         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1117         //
1118         //                    200015e0 on startup
1119         //         changes to 250115e0 when we sync to the signal
1120         //
1121         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1122         //
1123         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1124         //
1125         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1126         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1127         //
1128         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1129         //    perhaps some of them are related to analog output?
1130         //
1131         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1132         //    but the driver sets it to 0x8036802a at some point.
1133         //
1134         //    all of this is on request 214/215. other requests (192, 219,
1135         //    222, 223, 224) are used for firmware upgrade. Probably best to
1136         //    stay out of it unless you know what you're doing.
1137         //
1138         //
1139         // register 16:
1140         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1141         //
1142         // theories:
1143         //   0x01 - stable signal
1144         //   0x04 - deep color
1145         //   0x08 - unknown (audio??)
1146         //   0x20 - 720p??
1147         //   0x30 - 576p??
1148
1149         update_capture_mode();
1150
1151         struct ctrl {
1152                 int endpoint;
1153                 int request;
1154                 int index;
1155                 uint32_t data;
1156         };
1157         static const ctrl ctrls[] = {
1158                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1159                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1160
1161                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1162                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1163                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1164                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1165         };
1166
1167         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1168                 uint32_t flipped = htonl(ctrls[req].data);
1169                 static uint8_t value[4];
1170                 memcpy(value, &flipped, sizeof(flipped));
1171                 int size = sizeof(value);
1172                 //if (ctrls[req].request == 215) size = 0;
1173                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1174                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1175                 if (rc < 0) {
1176                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1177                         exit(1);
1178                 }
1179
1180                 if (ctrls[req].index == 16 && rc == 4) {
1181                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1182                 }
1183
1184 #if 0
1185                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1186                 for (int i = 0; i < rc; ++i) {
1187                         printf("%02x", value[i]);
1188                 }
1189                 printf("\n");
1190 #endif
1191         }
1192
1193 #if 0
1194         // DEBUG
1195         for ( ;; ) {
1196                 static int my_index = 0;
1197                 static uint8_t value[4];
1198                 int size = sizeof(value);
1199                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1200                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1201                 if (rc < 0) {
1202                         fprintf(stderr, "Error on control\n");
1203                         exit(1);
1204                 }
1205                 printf("rc=%d index=%d: 0x", rc, my_index);
1206                 for (int i = 0; i < rc; ++i) {
1207                         printf("%02x", value[i]);
1208                 }
1209                 printf("\n");
1210         }
1211 #endif
1212
1213 #if 0
1214         // set up an asynchronous transfer of the timer register
1215         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1216         static int completed = 0;
1217
1218         xfr = libusb_alloc_transfer(0);
1219         libusb_fill_control_setup(cmdbuf,
1220             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1221                 /*index=*/44, /*length=*/4);
1222         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1223         xfr->user_data = this;
1224         libusb_submit_transfer(xfr);
1225
1226         // set up an asynchronous transfer of register 24
1227         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1228         static int completed2 = 0;
1229
1230         xfr = libusb_alloc_transfer(0);
1231         libusb_fill_control_setup(cmdbuf2,
1232             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1233                 /*index=*/24, /*length=*/4);
1234         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1235         xfr->user_data = this;
1236         libusb_submit_transfer(xfr);
1237 #endif
1238
1239         // set up an asynchronous transfer of the register dump
1240         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1241         static int completed3 = 0;
1242
1243         xfr = libusb_alloc_transfer(0);
1244         libusb_fill_control_setup(cmdbuf3,
1245             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1246                 /*index=*/current_register, /*length=*/4);
1247         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1248         xfr->user_data = this;
1249         //libusb_submit_transfer(xfr);
1250
1251         //audiofp = fopen("audio.raw", "wb");
1252
1253         // set up isochronous transfers for audio and video
1254         for (int e = 3; e <= 4; ++e) {
1255                 //int num_transfers = (e == 3) ? 6 : 6;
1256                 int num_transfers = 6;
1257                 for (int i = 0; i < num_transfers; ++i) {
1258                         size_t buf_size;
1259                         int num_iso_pack, size;
1260                         if (e == 3) {
1261                                 // Allocate for minimum width (because that will give us the most
1262                                 // number of packets, so we don't need to reallocated, but we'll
1263                                 // default to 720p for the first frame.
1264                                 size = find_xfer_size_for_width(MIN_WIDTH);
1265                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1266                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1267                         } else {
1268                                 size = 0xc0;
1269                                 num_iso_pack = 80;
1270                                 buf_size = num_iso_pack * size;
1271                         }
1272                         int num_bytes = num_iso_pack * size;
1273                         assert(size_t(num_bytes) <= buf_size);
1274 #if LIBUSB_API_VERSION >= 0x01000105
1275                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1276 #else
1277                         uint8_t *buf = nullptr;
1278 #endif
1279                         if (buf == nullptr) {
1280                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1281 #if LIBUSB_API_VERSION >= 0x01000105
1282                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1283 #else
1284                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1285 #endif
1286                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1287                                 buf = new uint8_t[num_bytes];
1288                         }
1289
1290                         xfr = libusb_alloc_transfer(num_iso_pack);
1291                         if (!xfr) {
1292                                 fprintf(stderr, "oom\n");
1293                                 exit(1);
1294                         }
1295
1296                         int ep = LIBUSB_ENDPOINT_IN | e;
1297                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1298                                 num_iso_pack, cb_xfr, nullptr, 0);
1299                         libusb_set_iso_packet_lengths(xfr, size);
1300                         xfr->user_data = this;
1301
1302                         if (e == 3) {
1303                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1304                         }
1305
1306                         iso_xfrs.push_back(xfr);
1307                 }
1308         }
1309 }
1310
1311 void BMUSBCapture::start_bm_capture()
1312 {
1313         int i = 0;
1314         for (libusb_transfer *xfr : iso_xfrs) {
1315                 int rc = libusb_submit_transfer(xfr);
1316                 ++i;
1317                 if (rc < 0) {
1318                         //printf("num_bytes=%d\n", num_bytes);
1319                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1320                                 xfr->endpoint, i, libusb_error_name(rc));
1321                         exit(1);
1322                 }
1323         }
1324
1325
1326 #if 0
1327         libusb_release_interface(devh, 0);
1328 out:
1329         if (devh)
1330                 libusb_close(devh);
1331         libusb_exit(nullptr);
1332         return rc;
1333 #endif
1334 }
1335
1336 void BMUSBCapture::stop_dequeue_thread()
1337 {
1338         dequeue_thread_should_quit = true;
1339         queues_not_empty.notify_all();
1340         dequeue_thread.join();
1341 }
1342
1343 void BMUSBCapture::start_bm_thread()
1344 {
1345         // Devices leaving are discovered by seeing the isochronous packets
1346         // coming back with errors, so only care about devices joining.
1347         if (card_connected_callback != nullptr) {
1348                 if (libusb_hotplug_register_callback(
1349                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, LIBUSB_HOTPLUG_NO_FLAGS,
1350                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1351                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1352                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1353                         exit(1);
1354                 }
1355         }
1356
1357         should_quit = false;
1358         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1359 }
1360
1361 void BMUSBCapture::stop_bm_thread()
1362 {
1363         should_quit = true;
1364         usb_thread.join();
1365 }
1366
1367 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1368 {
1369         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1370         VideoMode auto_mode;
1371         auto_mode.name = "Autodetect";
1372         auto_mode.autodetect = true;
1373         return {{ 0, auto_mode }};
1374 }
1375
1376 uint32_t BMUSBCapture::get_current_video_mode() const
1377 {
1378         return 0;  // Matches get_available_video_modes().
1379 }
1380
1381 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1382 {
1383         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1384 }
1385
1386 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1387 {
1388         return {
1389                 { 0x00000000, "HDMI/SDI" },
1390                 { 0x02000000, "Component" },
1391                 { 0x04000000, "Composite" },
1392                 { 0x06000000, "S-video" }
1393         };
1394 }
1395
1396 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1397 {
1398         assert((video_input_id & ~0x06000000) == 0);
1399         current_video_input = video_input_id;
1400         update_capture_mode();
1401 }
1402
1403 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1404 {
1405         return {
1406                 { 0x00000000, "Embedded" },
1407                 { 0x10000000, "Analog" }
1408         };
1409 }
1410
1411 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1412 {
1413         assert((audio_input_id & ~0x10000000) == 0);
1414         current_audio_input = audio_input_id;
1415         update_capture_mode();
1416 }
1417
1418 void BMUSBCapture::update_capture_mode()
1419 {
1420         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1421         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1422         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1423
1424         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1425                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1426         if (rc < 0) {
1427                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1428                 exit(1);
1429         }
1430 }
1431
1432 }  // namespace bmusb