]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Use GCC multiversioning instead of #ifdefs and -march=native.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <condition_variable>
29 #include <cstddef>
30 #include <cstdint>
31 #include <deque>
32 #include <functional>
33 #include <memory>
34 #include <mutex>
35 #include <stack>
36 #include <string>
37 #include <thread>
38
39 using namespace std;
40 using namespace std::placeholders;
41
42 #define USB_VENDOR_BLACKMAGIC 0x1edb
43 #define MIN_WIDTH 640
44 #define HEADER_SIZE 44
45 //#define HEADER_SIZE 0
46 #define AUDIO_HEADER_SIZE 4
47
48 #define FRAME_SIZE (8 << 20)  // 8 MB.
49 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
50
51 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
52
53 namespace {
54
55 FILE *audiofp;
56
57 thread usb_thread;
58 atomic<bool> should_quit;
59
// Computes the isochronous packet size, in bytes, to use for a given frame
// width. Video seems to need packets that scale with the width; about six
// lines' worth (at two bytes per pixel) appears to be right, rounded up to
// the required 1 kB multiple.
int find_xfer_size_for_width(int width)
{
        constexpr int packet_granularity = 1024;

        // Note that for 10-bit input, this would need to grow accordingly
        // (i.e., be multiplied by 4/3).
        const int unrounded = width * 2 * 6;

        // Round up to the next multiple of the (power-of-two) granularity;
        // values that are already a multiple are left untouched.
        return (unrounded + (packet_granularity - 1)) & ~(packet_granularity - 1);
}
74
75 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
76 {
77         assert(width >= MIN_WIDTH);
78         size_t size = find_xfer_size_for_width(width);
79         int num_iso_pack = xfr->length / size;
80         if (num_iso_pack != xfr->num_iso_packets ||
81             size != xfr->iso_packet_desc[0].length) {
82                 xfr->num_iso_packets = num_iso_pack;
83                 libusb_set_iso_packet_lengths(xfr, size);
84         }
85 }
86
87 }  // namespace
88
89 FrameAllocator::~FrameAllocator() {}
90
91 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
92         : frame_size(frame_size)
93 {
94         for (size_t i = 0; i < num_queued_frames; ++i) {
95                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
96         }
97 }
98
99 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
100 {
101         Frame vf;
102         vf.owner = this;
103
104         unique_lock<mutex> lock(freelist_mutex);  // Meh.
105         if (freelist.empty()) {
106                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
107                         frame_size);
108         } else {
109                 vf.data = freelist.top().release();
110                 vf.size = frame_size;
111                 freelist.pop();  // Meh.
112         }
113         return vf;
114 }
115
116 void MallocFrameAllocator::release_frame(Frame frame)
117 {
118         if (frame.overflow > 0) {
119                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
120         }
121         unique_lock<mutex> lock(freelist_mutex);
122         freelist.push(unique_ptr<uint8_t[]>(frame.data));
123 }
124
// Compares two 16-bit counters that may wrap around: a counts as "less than"
// b if b can be reached from a by stepping forward at least one and fewer
// than 0x8000 steps (modulo 0x10000). Equal values are never "less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Truncating to 16 bits yields the forward distance modulo 0x10000.
        const uint16_t forward_distance = static_cast<uint16_t>(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
136
137 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
138 {
139         unique_lock<mutex> lock(queue_lock);
140         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
141                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
142                         q->back().timecode, timecode);
143                 frame.owner->release_frame(frame);
144                 return;
145         }
146
147         QueuedFrame qf;
148         qf.format = format;
149         qf.timecode = timecode;
150         qf.frame = frame;
151         q->push_back(move(qf));
152         queues_not_empty.notify_one();  // might be spurious
153 }
154
155 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
156 {
157         FILE *fp = fopen(filename, "wb");
158         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
159                 printf("short write!\n");
160         }
161         fclose(fp);
162 }
163
164 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
165 {
166         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
167 }
168
169 void BMUSBCapture::dequeue_thread_func()
170 {
171         if (has_dequeue_callbacks) {
172                 dequeue_init_callback();
173         }
174         while (!dequeue_thread_should_quit) {
175                 unique_lock<mutex> lock(queue_lock);
176                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
177
178                 if (dequeue_thread_should_quit) break;
179
180                 uint16_t video_timecode = pending_video_frames.front().timecode;
181                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
182                 AudioFormat audio_format;
183                 audio_format.bits_per_sample = 24;
184                 audio_format.num_channels = 8;
185                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
186                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
187                                 video_timecode);
188                         QueuedFrame video_frame = pending_video_frames.front();
189                         pending_video_frames.pop_front();
190                         lock.unlock();
191                         video_frame_allocator->release_frame(video_frame.frame);
192                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
193                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
194                                 audio_timecode);
195                         QueuedFrame audio_frame = pending_audio_frames.front();
196                         pending_audio_frames.pop_front();
197                         lock.unlock();
198                         audio_format.id = audio_frame.format;
199
200                         // Use the video format of the pending frame.
201                         QueuedFrame video_frame = pending_video_frames.front();
202                         VideoFormat video_format;
203                         decode_video_format(video_frame.format, &video_format);
204
205                         frame_callback(audio_timecode,
206                                        FrameAllocator::Frame(), 0, video_format,
207                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
208                 } else {
209                         QueuedFrame video_frame = pending_video_frames.front();
210                         QueuedFrame audio_frame = pending_audio_frames.front();
211                         pending_audio_frames.pop_front();
212                         pending_video_frames.pop_front();
213                         lock.unlock();
214
215 #if 0
216                         char filename[255];
217                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
218                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
219                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
220 #endif
221
222                         VideoFormat video_format;
223                         audio_format.id = audio_frame.format;
224                         if (decode_video_format(video_frame.format, &video_format)) {
225                                 frame_callback(video_timecode,
226                                                video_frame.frame, HEADER_SIZE, video_format,
227                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
228                         } else {
229                                 frame_callback(video_timecode,
230                                                FrameAllocator::Frame(), 0, video_format,
231                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
232                         }
233                 }
234         }
235         if (has_dequeue_callbacks) {
236                 dequeue_cleanup_callback();
237         }
238 }
239
240 void BMUSBCapture::start_new_frame(const uint8_t *start)
241 {
242         uint16_t format = (start[3] << 8) | start[2];
243         uint16_t timecode = (start[1] << 8) | start[0];
244
245         if (current_video_frame.len > 0) {
246                 // If format is 0x0800 (no signal), add a fake (empty) audio
247                 // frame to get it out of the queue.
248                 // TODO: Figure out if there are other formats that come with
249                 // no audio, and treat them the same.
250                 if (format == 0x0800) {
251                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
252                         if (fake_audio_frame.data == nullptr) {
253                                 // Oh well, it's just a no-signal frame anyway.
254                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
255                                 current_video_frame.owner->release_frame(current_video_frame);
256                                 current_video_frame = video_frame_allocator->alloc_frame();
257                                 return;
258                         }
259                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
260                 }
261                 //dump_frame();
262                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
263
264                 // Update the assumed frame width. We might be one frame too late on format changes,
265                 // but it's much better than asking the user to choose manually.
266                 VideoFormat video_format;
267                 if (decode_video_format(format, &video_format)) {
268                         assumed_frame_width = video_format.width;
269                 }
270         }
271         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
272         //      format, timecode,
273         //      //start[7], start[6], start[5], start[4],
274         //      read_current_frame, FRAME_SIZE);
275
276         current_video_frame = video_frame_allocator->alloc_frame();
277         //if (current_video_frame.data == nullptr) {
278         //      read_current_frame = -1;
279         //} else {
280         //      read_current_frame = 0;
281         //}
282 }
283
284 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
285 {
286         uint16_t format = (start[3] << 8) | start[2];
287         uint16_t timecode = (start[1] << 8) | start[0];
288         if (current_audio_frame.len > 0) {
289                 //dump_audio_block();
290                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
291         }
292         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
293         //      format, timecode, read_current_audio_block);
294         current_audio_frame = audio_frame_allocator->alloc_frame();
295 }
296
297 #if 0
298 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
299 {
300         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
301         for (unsigned j = 0; j < pack->actual_length; j++) {
302         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
303                 printf("%02x", xfr->buffer[j + offset]);
304                 if ((j % 16) == 15)
305                         printf("\n");
306                 else if ((j % 8) == 7)
307                         printf("  ");
308                 else
309                         printf(" ");
310         }
311 }
312 #endif
313
// De-interleaves n bytes from src: bytes at even positions go to dest1 and
// bytes at odd positions go to dest2, so each destination receives n/2
// bytes. n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t pos = 0; pos < n; pos += 2) {
                const size_t pair = pos / 2;
                dest1[pair] = src[pos];
                dest2[pair] = src[pos + 1];
        }
}
325
326 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
327 {
328         if (current_frame->data == nullptr ||
329             current_frame->len > current_frame->size ||
330             start == end) {
331                 return;
332         }
333
334         int bytes = end - start;
335         if (current_frame->len + bytes > current_frame->size) {
336                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
337                 current_frame->len = current_frame->size;
338                 if (current_frame->overflow > 1048576) {
339                         printf("%d bytes overflow after last %s frame\n",
340                                 int(current_frame->overflow), frame_type_name);
341                         current_frame->overflow = 0;
342                 }
343                 //dump_frame();
344         } else {
345                 if (current_frame->interleaved) {
346                         uint8_t *data = current_frame->data + current_frame->len / 2;
347                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
348                         if (current_frame->len % 2 == 1) {
349                                 ++data;
350                                 swap(data, data2);
351                         }
352                         if (bytes % 2 == 1) {
353                                 *data++ = *start++;
354                                 swap(data, data2);
355                                 ++current_frame->len;
356                                 --bytes;
357                         }
358                         memcpy_interleaved(data, data2, start, bytes);
359                         current_frame->len += bytes;
360                 } else {
361                         memcpy(current_frame->data + current_frame->len, start, bytes);
362                         current_frame->len += bytes;
363                 }
364         }
365 }
366
367 #if 0
// Debugging aid: prints the 32 bytes of an AVX2 register as hex, lowest byte
// first, prefixed by a left-aligned name and with an extra space after each
// group of eight bytes (except the last).
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                printf(" %02x", bytes[i]);
                if (i % 8 == 7 && i != 31) {
                        printf(" ");
                }
        }
        printf("\n");
}
408 #endif
409
410 #ifndef HAS_MULTIVERSIONING
411
// Version used when HAS_MULTIVERSIONING is not defined. It consumes nothing:
// start is returned unchanged, and current_frame, limit and sync_char are
// not looked at.
412 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
413 {
414         // No fast path possible unless we have multiversioning.
415         return start;
416 }
417
418 #else  // defined(HAS_MULTIVERSIONING)
419
420 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
421
422 // Does a memcpy and memchr in one to reduce processing time.
423 // Note that the benefit is somewhat limited if your L3 cache is small,
424 // as you'll (unfortunately) spend most of the time loading the data
425 // from main memory.
426 //
427 // Complicated cases are left to the slow path; it basically stops copying
428 // up until the first instance of "sync_char" (usually a bit before, actually).
429 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
430 // data, and what we really need this for is the 00 00 ff ff marker in video data.
// "default" version of the multiversioned fast path. It consumes nothing:
// start is returned unchanged, and current_frame, limit and sync_char are
// not looked at.
431 __attribute__((target("default")))
432 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
433 {
434         // No fast path possible unless we have SSE 4.1 or higher.
435         return start;
436 }
437
// SIMD-capable version of the fast path. Copies a prefix of [start, limit)
// into current_frame and returns a pointer to the first input byte that was
// not consumed.
//
// start is returned unchanged (nothing consumed) if the frame has no buffer,
// if its length already exceeds its size, if fewer than 128 bytes are
// available, or if nothing remains once limit has been clamped and aligned.
// Otherwise the unaligned head is searched for sync_char with memchr() and
// copied through add_to_frame(), and the 32-byte aligned remainder is handed
// to add_to_frame_fastpath_core().
//
// NOTE(review): per the GCC documentation, several strings in one target
// attribute are combined, so this version presumably requires both SSE 4.1
// and AVX2 -- confirm that this is the intended dispatch condition.
438 __attribute__((target("sse4.1", "avx2")))
439 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
440 {
441         if (current_frame->data == nullptr ||
442             current_frame->len > current_frame->size ||
443             start == limit) {
444                 return start;
445         }
446         size_t orig_bytes = limit - start;
447         if (orig_bytes < 128) {
448                 // Don't bother.
449                 return start;
450         }
451
452         // Don't read more bytes than we can write.
453         limit = min(limit, start + (current_frame->size - current_frame->len));
454
455         // Align end to 32 bytes.
456         limit = (const uint8_t *)(intptr_t(limit) & ~31);
457
458         if (start >= limit) {
459                 return start;
460         }
461
462         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
         // (limit is 32-byte aligned and start < limit here, so rounding
         // start up can never move it past limit.)
463         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
464         if (aligned_start != start) {
465                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
466                 if (sync_start == nullptr) {
467                         add_to_frame(current_frame, "", start, aligned_start);
468                 } else {
                         // A possible sync marker begins in the unaligned head:
                         // copy only what precedes it and give up on the fast path.
469                         add_to_frame(current_frame, "", start, sync_start);
470                         return sync_start;
471                 }
472         }
473
474         // Make the length a multiple of 64.
         // (The interleaved core loops consume two vectors per iteration;
         // the length is already a multiple of 32, so dropping one 32-byte
         // block is enough.)
475         if (current_frame->interleaved) {
476                 if (((limit - aligned_start) % 64) != 0) {
477                         limit -= 32;
478                 }
479                 assert(((limit - aligned_start) % 64) == 0);
480         }
481
482         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
483 }
484
// AVX2 version of the fast-path inner loop. Copies 32-byte vectors from
// [aligned_start, limit) into current_frame while comparing every byte
// against sync_char, and stops in front of the first chunk that contains it.
// That chunk has already been stored to the frame at that point, but it is
// not counted in current_frame->len and is not consumed.
//
// aligned_start is read with aligned loads. In interleaved mode, each
// iteration reads 64 bytes and writes the even-positioned bytes to one plane
// and the odd-positioned bytes to the other; the caller visible in this file
// makes limit - aligned_start a multiple of 64 for that case.
//
// Updates current_frame->len and returns a pointer to the first input byte
// that was not consumed.
485 __attribute__((target("avx2")))
486 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
487 {
488         const __m256i needle = _mm256_set1_epi8(sync_char);
489
490         const __restrict __m256i *in = (const __m256i *)aligned_start;
491         if (current_frame->interleaved) {
                 // out1 receives the bytes at even input positions, out2 those
                 // at odd positions. If len is odd, the next byte belongs in
                 // the second plane, so the roles are swapped.
492                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
493                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
494                 if (current_frame->len % 2 == 1) {
495                         swap(out1, out2);
496                 }
497
                 // Within each 128-bit lane: gather the even-indexed bytes into
                 // the low 64 bits and the odd-indexed bytes into the high 64 bits.
498                 __m256i shuffle_cw = _mm256_set_epi8(
499                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
500                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
501                 while (in < (const __m256i *)limit) {
502                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
503                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
504                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
505
506                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
507                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
508                         __m256i found = _mm256_or_si256(found1, found2);
509
510                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
511                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
512                 
513                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
514                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
515
                         // lo = low halves of data1 and data2 (the even-position bytes),
                         // hi = high halves of data1 and data2 (the odd-position bytes).
516                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
517                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
518
519                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
520                         _mm256_storeu_si256(out2, hi);
521
                         // Any sync_char among these 64 bytes: stop without
                         // advancing, so that they are not counted as consumed.
522                         if (!_mm256_testz_si256(found, found)) {
523                                 break;
524                         }
525
526                         in += 2;
527                         ++out1;
528                         ++out2;
529                 }
530                 current_frame->len += (uint8_t *)in - aligned_start;
531         } else {
532                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
533                 while (in < (const __m256i *)limit) {
534                         __m256i data = _mm256_load_si256(in);
535                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
536                         __m256i found = _mm256_cmpeq_epi8(data, needle);
537                         if (!_mm256_testz_si256(found, found)) {
538                                 break;
539                         }
540
541                         ++in;
542                         ++out;
543                 }
                 // out only advances past fully accepted vectors, so this
                 // counts exactly the consumed bytes.
544                 current_frame->len = (uint8_t *)out - current_frame->data;
545         }
546
547         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
548         return (const uint8_t *)in;
549 }
550
// SSE 4.1 version of the fast-path inner loop. Copies 16-byte vectors from
// [aligned_start, limit) into current_frame while comparing every byte
// against sync_char, and stops in front of the first chunk that contains it.
// That chunk has already been stored to the frame at that point, but it is
// not counted in current_frame->len and is not consumed.
//
// aligned_start is read with aligned loads. In interleaved mode, each
// iteration reads 32 bytes and writes the even-positioned bytes to one plane
// and the odd-positioned bytes to the other.
//
// Updates current_frame->len and returns a pointer to the first input byte
// that was not consumed.
551 __attribute__((target("sse4.1")))
552 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
553 {
554         const __m128i needle = _mm_set1_epi8(sync_char);
555
556         const __m128i *in = (const __m128i *)aligned_start;
557         if (current_frame->interleaved) {
                 // out1 receives the bytes at even input positions, out2 those
                 // at odd positions. If len is odd, the next byte belongs in
                 // the second plane, so the roles are swapped.
558                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
559                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
560                 if (current_frame->len % 2 == 1) {
561                         swap(out1, out2);
562                 }
563
564                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
565                 while (in < (const __m128i *)limit) {
566                         __m128i data1 = _mm_load_si128(in);
567                         __m128i data2 = _mm_load_si128(in + 1);
                         // Treat the input as 16-bit words: masking keeps the
                         // low (even-position) byte of each word, shifting right
                         // keeps the high (odd-position) byte.
568                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
569                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
570                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
571                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
                         // Every word is now in 0..255, so the saturating pack
                         // simply narrows each word back to one byte.
572                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
573                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
574                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
575                         _mm_storeu_si128(out2, hi);
576                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
577                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                         // Any sync_char among these 32 bytes: stop without
                         // advancing, so that they are not counted as consumed.
578                         if (!_mm_testz_si128(found1, found1) ||
579                             !_mm_testz_si128(found2, found2)) {
580                                 break;
581                         }
582
583                         in += 2;
584                         ++out1;
585                         ++out2;
586                 }
587                 current_frame->len += (uint8_t *)in - aligned_start;
588         } else {
589                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
590                 while (in < (const __m128i *)limit) {
591                         __m128i data = _mm_load_si128(in);
592                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
593                         __m128i found = _mm_cmpeq_epi8(data, needle);
594                         if (!_mm_testz_si128(found, found)) {
595                                 break;
596                         }
597
598                         ++in;
599                         ++out;
600                 }
                 // out only advances past fully accepted vectors, so this
                 // counts exactly the consumed bytes.
601                 current_frame->len = (uint8_t *)out - current_frame->data;
602         }
603
604         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
605         return (const uint8_t *)in;
606 }
607
608 #endif  // defined(HAS_MULTIVERSIONING)
609
610 void decode_packs(const libusb_transfer *xfr,
611                   const char *sync_pattern,
612                   int sync_length,
613                   FrameAllocator::Frame *current_frame,
614                   const char *frame_type_name,
615                   function<void(const uint8_t *start)> start_callback)
616 {
617         int offset = 0;
618         for (int i = 0; i < xfr->num_iso_packets; i++) {
619                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
620
621                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
622                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
623                         continue;
624 //exit(5);
625                 }
626
627                 const uint8_t *start = xfr->buffer + offset;
628                 const uint8_t *limit = start + pack->actual_length;
629                 while (start < limit) {  // Usually runs only one iteration.
630                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
631                         if (start == limit) break;
632                         assert(start < limit);
633
634                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
635                         if (start_next_frame == nullptr) {
636                                 // add the rest of the buffer
637                                 add_to_frame(current_frame, frame_type_name, start, limit);
638                                 break;
639                         } else {
640                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
641                                 start = start_next_frame + sync_length;  // skip sync
642                                 start_callback(start);
643                         }
644                 }
645 #if 0
646                 dump_pack(xfr, offset, pack);
647 #endif
648                 offset += pack->length;
649         }
650 }
651
652 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
653 {
654         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
655             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
656                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
657                 libusb_free_transfer(xfr);
658                 exit(3);
659         }
660
661         assert(xfr->user_data != nullptr);
662         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
663
664         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
665                 if (!usb->disconnected) {
666                         fprintf(stderr, "Device went away, stopping transfers.\n");
667                         usb->disconnected = true;
668                         if (usb->card_disconnected_callback) {
669                                 usb->card_disconnected_callback();
670                         }
671                 }
672                 // Don't reschedule the transfer; the loop will stop by itself.
673                 return;
674         }
675
676         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
677                 if (xfr->endpoint == 0x84) {
678                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
679                 } else {
680                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
681
682                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
683                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
684                 }
685         }
686         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
687                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
688                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
689 #if 0
690                 if (setup->wIndex == 44) {
691                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
692                 } else {
693                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
694                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
695                 }
696 #else
697                 memcpy(usb->register_file + usb->current_register, buf, 4);
698                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
699                 if (usb->current_register == 0) {
700                         // read through all of them
701                         printf("register dump:");
702                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
703                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
704                         }
705                         printf("\n");
706                 }
707                 libusb_fill_control_setup(xfr->buffer,
708                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
709                         /*index=*/usb->current_register, /*length=*/4);
710 #endif
711         }
712
713 #if 0
714         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
715         for (i = 0; i < xfr->actual_length; i++) {
716                 printf("%02x", xfr->buffer[i]);
717                 if (i % 16)
718                         printf("\n");
719                 else if (i % 8)
720                         printf("  ");
721                 else
722                         printf(" ");
723         }
724 #endif
725
726         int rc = libusb_submit_transfer(xfr);
727         if (rc < 0) {
728                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
729                 exit(1);
730         }
731 }
732
733 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
734 {
735         if (card_connected_callback != nullptr) {
736                 libusb_device_descriptor desc;
737                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
738                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
739                         libusb_unref_device(dev);
740                         return 1;
741                 }
742
743                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
744                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
745                         card_connected_callback(dev);  // Callback takes ownership.
746                         return 0;
747                 }
748         }
749         libusb_unref_device(dev);
750         return 0;
751 }
752
753 void BMUSBCapture::usb_thread_func()
754 {
755         sched_param param;
756         memset(&param, 0, sizeof(param));
757         param.sched_priority = 1;
758         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
759                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
760         }
761         while (!should_quit) {
762                 int rc = libusb_handle_events(nullptr);
763                 if (rc != LIBUSB_SUCCESS)
764                         break;
765         }
766 }
767
768 struct USBCardDevice {
769         uint16_t product;
770         uint8_t bus, port;
771         libusb_device *device;
772 };
773
// Maps a supported USB product ID to a human-readable product name.
// Any other ID is a programming error: it trips an assert, and returns
// nullptr when asserts are compiled out.
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
785
786 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
787 {
788         const char *product_name = get_product_name(product);
789
790         char buf[256];
791         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
792                 id, bus, port, product_name);
793         return buf;
794 }
795
796 libusb_device_handle *open_card(int card_index, string *description)
797 {
798         libusb_device **devices;
799         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
800         if (num_devices == -1) {
801                 fprintf(stderr, "Error finding USB devices\n");
802                 exit(1);
803         }
804         vector<USBCardDevice> found_cards;
805         for (ssize_t i = 0; i < num_devices; ++i) {
806                 libusb_device_descriptor desc;
807                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
808                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
809                         exit(1);
810                 }
811
812                 uint8_t bus = libusb_get_bus_number(devices[i]);
813                 uint8_t port = libusb_get_port_number(devices[i]);
814
815                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
816                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
817                         libusb_unref_device(devices[i]);
818                         continue;
819                 }
820
821                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
822         }
823         libusb_free_device_list(devices, 0);
824
825         // Sort the devices to get a consistent ordering.
826         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
827                 if (a.product != b.product)
828                         return a.product < b.product;
829                 if (a.bus != b.bus)
830                         return a.bus < b.bus;
831                 return a.port < b.port;
832         });
833
834         for (size_t i = 0; i < found_cards.size(); ++i) {
835                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
836                 fprintf(stderr, "%s\n", tmp_description.c_str());
837                 if (i == size_t(card_index)) {
838                         *description = tmp_description;
839                 }
840         }
841
842         if (size_t(card_index) >= found_cards.size()) {
843                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
844                 exit(1);
845         }
846
847         libusb_device_handle *devh;
848         int rc = libusb_open(found_cards[card_index].device, &devh);
849         if (rc < 0) {
850                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
851                 exit(1);
852         }
853
854         for (size_t i = 0; i < found_cards.size(); ++i) {
855                 libusb_unref_device(found_cards[i].device);
856         }
857
858         return devh;
859 }
860
861 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
862 {
863         uint8_t bus = libusb_get_bus_number(dev);
864         uint8_t port = libusb_get_port_number(dev);
865
866         libusb_device_descriptor desc;
867         if (libusb_get_device_descriptor(dev, &desc) < 0) {
868                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
869                 exit(1);
870         }
871
872         *description = get_card_description(card_index, bus, port, desc.idProduct);
873
874         libusb_device_handle *devh;
875         int rc = libusb_open(dev, &devh);
876         if (rc < 0) {
877                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
878                 exit(1);
879         }
880
881         return devh;
882 }
883
884 void BMUSBCapture::configure_card()
885 {
886         if (video_frame_allocator == nullptr) {
887                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
888                 set_video_frame_allocator(owned_video_frame_allocator.get());
889         }
890         if (audio_frame_allocator == nullptr) {
891                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
892                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
893         }
894         dequeue_thread_should_quit = false;
895         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
896
897         int rc;
898         struct libusb_transfer *xfr;
899
900         rc = libusb_init(nullptr);
901         if (rc < 0) {
902                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
903                 exit(1);
904         }
905
906         if (dev == nullptr) {
907                 devh = open_card(card_index, &description);
908         } else {
909                 devh = open_card(card_index, dev, &description);
910                 libusb_unref_device(dev);
911         }
912         if (!devh) {
913                 fprintf(stderr, "Error finding USB device\n");
914                 exit(1);
915         }
916
917         libusb_config_descriptor *config;
918         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
919         if (rc < 0) {
920                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
921                 exit(1);
922         }
923
924 #if 0
925         printf("%d interface\n", config->bNumInterfaces);
926         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
927                 printf("  interface %d\n", interface_number);
928                 const libusb_interface *interface = &config->interface[interface_number];
929                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
930                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
931                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
932                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
933                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
934                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
935                         }
936                 }
937         }
938 #endif
939
940         rc = libusb_set_configuration(devh, /*configuration=*/1);
941         if (rc < 0) {
942                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
943                 exit(1);
944         }
945
946         rc = libusb_claim_interface(devh, 0);
947         if (rc < 0) {
948                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
949                 exit(1);
950         }
951
952         // Alternate setting 1 is output, alternate setting 2 is input.
953         // Card is reset when switching alternates, so the driver uses
954         // this “double switch” when it wants to reset.
955         //
956         // There's also alternate settings 3 and 4, which seem to be
957         // like 1 and 2 except they advertise less bandwidth needed.
958         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
959         if (rc < 0) {
960                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
961                 exit(1);
962         }
963         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
964         if (rc < 0) {
965                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
966                 exit(1);
967         }
968 #if 0
969         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
970         if (rc < 0) {
971                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
972                 exit(1);
973         }
974 #endif
975
976 #if 0
977         rc = libusb_claim_interface(devh, 3);
978         if (rc < 0) {
979                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
980                 exit(1);
981         }
982 #endif
983
984         // theories:
985         //   44 is some kind of timer register (first 16 bits count upwards)
986         //   24 is some sort of watchdog?
987         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
988         //      (or will go to 0x73c60010?), also seen 0x73c60100
989         //   12 also changes all the time, unclear why  
990         //   16 seems to be autodetected mode somehow
991         //      --    this is e00115e0 after reset?
992         //                    ed0115e0 after mode change [to output?]
993         //                    2d0015e0 after more mode change [to input]
994         //                    ed0115e0 after more mode change
995         //                    2d0015e0 after more mode change
996         //
997         //                    390115e0 seems to indicate we have signal
998         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
999         //
1000         //                    200015e0 on startup
1001         //         changes to 250115e0 when we sync to the signal
1002         //
1003         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1004         //
1005         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1006         //
1007         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1008         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1009         //
1010         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1011         //    perhaps some of them are related to analog output?
1012         //
1013         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1014         //    but the driver sets it to 0x8036802a at some point.
1015         //
1016         //    all of this is on request 214/215. other requests (192, 219,
1017         //    222, 223, 224) are used for firmware upgrade. Probably best to
1018         //    stay out of it unless you know what you're doing.
1019         //
1020         //
1021         // register 16:
1022         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1023         //
1024         // theories:
1025         //   0x01 - stable signal
1026         //   0x04 - deep color
1027         //   0x08 - unknown (audio??)
1028         //   0x20 - 720p??
1029         //   0x30 - 576p??
1030
1031         update_capture_mode();
1032
1033         struct ctrl {
1034                 int endpoint;
1035                 int request;
1036                 int index;
1037                 uint32_t data;
1038         };
1039         static const ctrl ctrls[] = {
1040                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1041                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1042
1043                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1044                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1045                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1046                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1047         };
1048
1049         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1050                 uint32_t flipped = htonl(ctrls[req].data);
1051                 static uint8_t value[4];
1052                 memcpy(value, &flipped, sizeof(flipped));
1053                 int size = sizeof(value);
1054                 //if (ctrls[req].request == 215) size = 0;
1055                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1056                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1057                 if (rc < 0) {
1058                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1059                         exit(1);
1060                 }
1061
1062                 if (ctrls[req].index == 16 && rc == 4) {
1063                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1064                 }
1065
1066 #if 0
1067                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1068                 for (int i = 0; i < rc; ++i) {
1069                         printf("%02x", value[i]);
1070                 }
1071                 printf("\n");
1072 #endif
1073         }
1074
1075 #if 0
1076         // DEBUG
1077         for ( ;; ) {
1078                 static int my_index = 0;
1079                 static uint8_t value[4];
1080                 int size = sizeof(value);
1081                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1082                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1083                 if (rc < 0) {
1084                         fprintf(stderr, "Error on control\n");
1085                         exit(1);
1086                 }
1087                 printf("rc=%d index=%d: 0x", rc, my_index);
1088                 for (int i = 0; i < rc; ++i) {
1089                         printf("%02x", value[i]);
1090                 }
1091                 printf("\n");
1092         }
1093 #endif
1094
1095 #if 0
1096         // set up an asynchronous transfer of the timer register
1097         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1098         static int completed = 0;
1099
1100         xfr = libusb_alloc_transfer(0);
1101         libusb_fill_control_setup(cmdbuf,
1102             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1103                 /*index=*/44, /*length=*/4);
1104         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1105         xfr->user_data = this;
1106         libusb_submit_transfer(xfr);
1107
1108         // set up an asynchronous transfer of register 24
1109         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1110         static int completed2 = 0;
1111
1112         xfr = libusb_alloc_transfer(0);
1113         libusb_fill_control_setup(cmdbuf2,
1114             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1115                 /*index=*/24, /*length=*/4);
1116         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1117         xfr->user_data = this;
1118         libusb_submit_transfer(xfr);
1119 #endif
1120
1121         // set up an asynchronous transfer of the register dump
1122         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1123         static int completed3 = 0;
1124
1125         xfr = libusb_alloc_transfer(0);
1126         libusb_fill_control_setup(cmdbuf3,
1127             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1128                 /*index=*/current_register, /*length=*/4);
1129         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1130         xfr->user_data = this;
1131         //libusb_submit_transfer(xfr);
1132
1133         //audiofp = fopen("audio.raw", "wb");
1134
1135         // set up isochronous transfers for audio and video
1136         for (int e = 3; e <= 4; ++e) {
1137                 //int num_transfers = (e == 3) ? 6 : 6;
1138                 int num_transfers = 6;
1139                 for (int i = 0; i < num_transfers; ++i) {
1140                         size_t buf_size;
1141                         int num_iso_pack, size;
1142                         if (e == 3) {
1143                                 // Allocate for minimum width (because that will give us the most
1144                                 // number of packets, so we don't need to reallocated, but we'll
1145                                 // default to 720p for the first frame.
1146                                 size = find_xfer_size_for_width(MIN_WIDTH);
1147                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1148                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1149                         } else {
1150                                 size = 0xc0;
1151                                 num_iso_pack = 80;
1152                                 buf_size = num_iso_pack * size;
1153                         }
1154                         int num_bytes = num_iso_pack * size;
1155                         assert(size_t(num_bytes) <= buf_size);
1156 #if LIBUSB_API_VERSION >= 0x01000105
1157                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1158 #else
1159                         uint8_t *buf = nullptr;
1160 #endif
1161                         if (buf == nullptr) {
1162                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1163 #if LIBUSB_API_VERSION >= 0x01000105
1164                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1165 #else
1166                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1167 #endif
1168                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1169                                 buf = new uint8_t[num_bytes];
1170                         }
1171
1172                         xfr = libusb_alloc_transfer(num_iso_pack);
1173                         if (!xfr) {
1174                                 fprintf(stderr, "oom\n");
1175                                 exit(1);
1176                         }
1177
1178                         int ep = LIBUSB_ENDPOINT_IN | e;
1179                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1180                                 num_iso_pack, cb_xfr, nullptr, 0);
1181                         libusb_set_iso_packet_lengths(xfr, size);
1182                         xfr->user_data = this;
1183
1184                         if (e == 3) {
1185                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1186                         }
1187
1188                         iso_xfrs.push_back(xfr);
1189                 }
1190         }
1191 }
1192
1193 void BMUSBCapture::start_bm_capture()
1194 {
1195         int i = 0;
1196         for (libusb_transfer *xfr : iso_xfrs) {
1197                 int rc = libusb_submit_transfer(xfr);
1198                 ++i;
1199                 if (rc < 0) {
1200                         //printf("num_bytes=%d\n", num_bytes);
1201                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1202                                 xfr->endpoint, i, libusb_error_name(rc));
1203                         exit(1);
1204                 }
1205         }
1206
1207
1208 #if 0
1209         libusb_release_interface(devh, 0);
1210 out:
1211         if (devh)
1212                 libusb_close(devh);
1213         libusb_exit(nullptr);
1214         return rc;
1215 #endif
1216 }
1217
1218 void BMUSBCapture::stop_dequeue_thread()
1219 {
1220         dequeue_thread_should_quit = true;
1221         queues_not_empty.notify_all();
1222         dequeue_thread.join();
1223 }
1224
1225 void BMUSBCapture::start_bm_thread()
1226 {
1227         // Devices leaving are discovered by seeing the isochronous packets
1228         // coming back with errors, so only care about devices joining.
1229         if (card_connected_callback != nullptr) {
1230                 if (libusb_hotplug_register_callback(
1231                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, LIBUSB_HOTPLUG_NO_FLAGS,
1232                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1233                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1234                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1235                         exit(1);
1236                 }
1237         }
1238
1239         should_quit = false;
1240         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1241 }
1242
1243 void BMUSBCapture::stop_bm_thread()
1244 {
1245         should_quit = true;
1246         usb_thread.join();
1247 }
1248
// One row of the table of known video formats. Member order matters: the
// table is written with positional brace-initialization.
struct VideoFormatEntry {
        uint16_t normalized_video_format;  // Format word after masking off the ignored bits.
        unsigned width;
        unsigned height;
        unsigned second_field_start;
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;
        unsigned frame_rate_nom;           // Frame rate is frame_rate_nom / frame_rate_den.
        unsigned frame_rate_den;
        bool interlaced;
};
1256
1257 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
1258 {
1259         decoded_video_format->id = video_format;
1260         decoded_video_format->interlaced = false;
1261
1262         // TODO: Add these for all formats as we find them.
1263         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
1264
1265         if (video_format == 0x0800) {
1266                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
1267                 // It's a strange thing, but what can you do.
1268                 decoded_video_format->width = 720;
1269                 decoded_video_format->height = 525;
1270                 decoded_video_format->extra_lines_top = 0;
1271                 decoded_video_format->extra_lines_bottom = 0;
1272                 decoded_video_format->frame_rate_nom = 3013;
1273                 decoded_video_format->frame_rate_den = 100;
1274                 decoded_video_format->has_signal = false;
1275                 return true;
1276         }
1277         if ((video_format & 0xe800) != 0xe800) {
1278                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
1279                         video_format);
1280                 decoded_video_format->width = 0;
1281                 decoded_video_format->height = 0;
1282                 decoded_video_format->extra_lines_top = 0;
1283                 decoded_video_format->extra_lines_bottom = 0;
1284                 decoded_video_format->frame_rate_nom = 60;
1285                 decoded_video_format->frame_rate_den = 1;
1286                 decoded_video_format->has_signal = false;
1287                 return false;
1288         }
1289
1290         decoded_video_format->has_signal = true;
1291
1292         // NTSC (480i59.94, I suppose). A special case, see below.
1293         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
1294                 decoded_video_format->width = 720;
1295                 decoded_video_format->height = 480;
1296                 decoded_video_format->extra_lines_top = 17;
1297                 decoded_video_format->extra_lines_bottom = 28;
1298                 decoded_video_format->frame_rate_nom = 30000;
1299                 decoded_video_format->frame_rate_den = 1001;
1300                 decoded_video_format->second_field_start = 280;
1301                 decoded_video_format->interlaced = true;
1302                 return true;
1303         }
1304
1305         // PAL (576i50, I suppose). A special case, see below.
1306         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
1307                 decoded_video_format->width = 720;
1308                 decoded_video_format->height = 576;
1309                 decoded_video_format->extra_lines_top = 22;
1310                 decoded_video_format->extra_lines_bottom = 27;
1311                 decoded_video_format->frame_rate_nom = 25;
1312                 decoded_video_format->frame_rate_den = 1;
1313                 decoded_video_format->second_field_start = 335;
1314                 decoded_video_format->interlaced = true;
1315                 return true;
1316         }
1317
1318         // 0x8 seems to be a flag about availability of deep color on the input,
1319         // except when it's not (e.g. it's the only difference between NTSC
1320         // and PAL). Rather confusing. But we clear it here nevertheless, because
1321         // usually it doesn't mean anything.
1322         //
1323         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
1324         uint16_t normalized_video_format = video_format & ~0xe80c;
1325         constexpr VideoFormatEntry entries[] = {
1326                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
1327                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
1328                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
1329                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
1330                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
1331                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
1332                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
1333                 { 0x01c3, 1920, 1080,   0,  0,  0,    30,    1, false },  // 1080p30.
1334                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
1335                 { 0x01e1, 1920, 1080,   0,  0,  0, 30000, 1001, false },  // 1080p29.97.
1336                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
1337                 { 0x0063, 1920, 1080,   0,  0,  0,    25,    1, false },  // 1080p25.
1338                 { 0x0043, 1920, 1080,   0,  0,  0,    25,    1,  true },  // 1080p50.
1339                 { 0x008e, 1920, 1080,   0,  0,  0,    24,    1, false },  // 1080p24.
1340                 { 0x00a1, 1920, 1080,   0,  0,  0, 24000, 1001, false },  // 1080p23.98.
1341         };
1342         for (const VideoFormatEntry &entry : entries) {
1343                 if (normalized_video_format == entry.normalized_video_format) {
1344                         decoded_video_format->width = entry.width;
1345                         decoded_video_format->height = entry.height;
1346                         decoded_video_format->second_field_start = entry.second_field_start;
1347                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
1348                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
1349                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
1350                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
1351                         decoded_video_format->interlaced = entry.interlaced;
1352                         return true;
1353                 }
1354         }
1355
1356         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
1357         decoded_video_format->width = 1280;
1358         decoded_video_format->height = 720;
1359         decoded_video_format->frame_rate_nom = 60;
1360         decoded_video_format->frame_rate_den = 1;
1361         return false;
1362 }
1363
1364 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1365 {
1366         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1367         VideoMode auto_mode;
1368         auto_mode.name = "Autodetect";
1369         auto_mode.autodetect = true;
1370         return {{ 0, auto_mode }};
1371 }
1372
1373 uint32_t BMUSBCapture::get_current_video_mode() const
1374 {
1375         return 0;  // Matches get_available_video_modes().
1376 }
1377
1378 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1379 {
1380         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1381 }
1382
1383 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1384 {
1385         return {
1386                 { 0x00000000, "HDMI/SDI" },
1387                 { 0x02000000, "Component" },
1388                 { 0x04000000, "Composite" },
1389                 { 0x06000000, "S-video" }
1390         };
1391 }
1392
1393 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1394 {
1395         assert((video_input_id & ~0x06000000) == 0);
1396         current_video_input = video_input_id;
1397         update_capture_mode();
1398 }
1399
1400 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1401 {
1402         return {
1403                 { 0x00000000, "Embedded" },
1404                 { 0x10000000, "Analog" }
1405         };
1406 }
1407
1408 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1409 {
1410         assert((audio_input_id & ~0x10000000) == 0);
1411         current_audio_input = audio_input_id;
1412         update_capture_mode();
1413 }
1414
1415 void BMUSBCapture::update_capture_mode()
1416 {
1417         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1418         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1419         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1420
1421         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1422                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1423         if (rc < 0) {
1424                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1425                 exit(1);
1426         }
1427 }