]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Give all of our threads meaningful names, to aid with debugging.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.5.4
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdint.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #if HAS_MULTIVERSIONING
23 #include <immintrin.h>
24 #endif
25 #include "bmusb/bmusb.h"
26
27 #include <algorithm>
28 #include <atomic>
29 #include <chrono>
30 #include <condition_variable>
31 #include <cstddef>
32 #include <cstdint>
33 #include <deque>
34 #include <functional>
35 #include <memory>
36 #include <mutex>
37 #include <stack>
38 #include <string>
39 #include <thread>
40
41 using namespace std;
42 using namespace std::chrono;
43 using namespace std::placeholders;
44
45 #define USB_VENDOR_BLACKMAGIC 0x1edb
46 #define MIN_WIDTH 640
47 #define HEADER_SIZE 44
48 //#define HEADER_SIZE 0
49 #define AUDIO_HEADER_SIZE 4
50
51 #define FRAME_SIZE (8 << 20)  // 8 MB.
52 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
53
54 namespace bmusb {
55
56 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
57 bool BMUSBCapture::hotplug_existing_devices = false;
58
59 namespace {
60
61 FILE *audiofp;
62
63 thread usb_thread;
64 atomic<bool> should_quit;
65
// Returns the isochronous packet size (in bytes) to use for video of the
// given width.
int find_xfer_size_for_width(int width)
{
        // Video seems to want packets that scale with the width; about six
        // lines of two-bytes-per-pixel data appears to be right.
        // (10-bit input would need 4/3 of this; that is not handled here.)
        constexpr int kBytesPerPixel = 2;
        constexpr int kLinesPerPacket = 6;
        constexpr int kGranularity = 1024;

        const int raw_size = width * kBytesPerPixel * kLinesPerPacket;

        // Round up to the required 1 kB multiple (exact multiples are left alone).
        return (raw_size + (kGranularity - 1)) & ~(kGranularity - 1);
}
80
81 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
82 {
83         assert(width >= MIN_WIDTH);
84         size_t size = find_xfer_size_for_width(width);
85         int num_iso_pack = xfr->length / size;
86         if (num_iso_pack != xfr->num_iso_packets ||
87             size != xfr->iso_packet_desc[0].length) {
88                 xfr->num_iso_packets = num_iso_pack;
89                 libusb_set_iso_packet_lengths(xfr, size);
90         }
91 }
92
// One row of the table of known video formats. The field order matters,
// since rows are written as positional aggregate initializers.
struct VideoFormatEntry {
        // Format ID after the flag bits have been masked off.
        uint16_t normalized_video_format;

        // Picture geometry.
        unsigned width;
        unsigned height;
        unsigned second_field_start;  // Nonzero only for the interlaced rows.

        // Lines outside the picture, above and below it.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate as a fraction (nom/den).
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
100
101 // Get details for the given video format; returns false if detection was incomplete.
102 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
103 {
104         decoded_video_format->id = video_format;
105         decoded_video_format->interlaced = false;
106
107         // TODO: Add these for all formats as we find them.
108         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
109
110         if (video_format == 0x0800) {
111                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
112                 // It's a strange thing, but what can you do.
113                 decoded_video_format->width = 720;
114                 decoded_video_format->height = 525;
115                 decoded_video_format->extra_lines_top = 0;
116                 decoded_video_format->extra_lines_bottom = 0;
117                 decoded_video_format->frame_rate_nom = 3013;
118                 decoded_video_format->frame_rate_den = 100;
119                 decoded_video_format->has_signal = false;
120                 return true;
121         }
122         if ((video_format & 0xe800) != 0xe800) {
123                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
124                         video_format);
125                 decoded_video_format->width = 0;
126                 decoded_video_format->height = 0;
127                 decoded_video_format->extra_lines_top = 0;
128                 decoded_video_format->extra_lines_bottom = 0;
129                 decoded_video_format->frame_rate_nom = 60;
130                 decoded_video_format->frame_rate_den = 1;
131                 decoded_video_format->has_signal = false;
132                 return false;
133         }
134
135         decoded_video_format->has_signal = true;
136
137         // NTSC (480i59.94, I suppose). A special case, see below.
138         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
139                 decoded_video_format->width = 720;
140                 decoded_video_format->height = 480;
141                 decoded_video_format->extra_lines_top = 17;
142                 decoded_video_format->extra_lines_bottom = 28;
143                 decoded_video_format->frame_rate_nom = 30000;
144                 decoded_video_format->frame_rate_den = 1001;
145                 decoded_video_format->second_field_start = 280;
146                 decoded_video_format->interlaced = true;
147                 return true;
148         }
149
150         // PAL (576i50, I suppose). A special case, see below.
151         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
152                 decoded_video_format->width = 720;
153                 decoded_video_format->height = 576;
154                 decoded_video_format->extra_lines_top = 22;
155                 decoded_video_format->extra_lines_bottom = 27;
156                 decoded_video_format->frame_rate_nom = 25;
157                 decoded_video_format->frame_rate_den = 1;
158                 decoded_video_format->second_field_start = 335;
159                 decoded_video_format->interlaced = true;
160                 return true;
161         }
162
163         // 0x8 seems to be a flag about availability of deep color on the input,
164         // except when it's not (e.g. it's the only difference between NTSC
165         // and PAL). Rather confusing. But we clear it here nevertheless, because
166         // usually it doesn't mean anything.
167         //
168         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
169         uint16_t normalized_video_format = video_format & ~0xe80c;
170         constexpr VideoFormatEntry entries[] = {
171                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
172                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
173                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
174                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
175                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
176                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
177                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
178                 { 0x01c3, 1920, 1080,   0, 20, 25,    30,    1, false },  // 1080p30.
179                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
180                 { 0x01e1, 1920, 1080,   0, 20, 25, 30000, 1001, false },  // 1080p29.97.
181                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
182                 { 0x0063, 1920, 1080,   0, 20, 25,    25,    1, false },  // 1080p25.
183                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
184                 { 0x0083, 1920, 1080,   0, 20, 25,    24,    1, false },  // 1080p24.
185                 { 0x00a1, 1920, 1080,   0, 20, 25, 24000, 1001, false },  // 1080p23.98.
186         };
187         for (const VideoFormatEntry &entry : entries) {
188                 if (normalized_video_format == entry.normalized_video_format) {
189                         decoded_video_format->width = entry.width;
190                         decoded_video_format->height = entry.height;
191                         decoded_video_format->second_field_start = entry.second_field_start;
192                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
193                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
194                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
195                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
196                         decoded_video_format->interlaced = entry.interlaced;
197                         return true;
198                 }
199         }
200
201         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
202         decoded_video_format->width = 1280;
203         decoded_video_format->height = 720;
204         decoded_video_format->frame_rate_nom = 60;
205         decoded_video_format->frame_rate_den = 1;
206         return false;
207 }
208
209 }  // namespace
210
211 FrameAllocator::~FrameAllocator() {}
212
213 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
214         : frame_size(frame_size)
215 {
216         for (size_t i = 0; i < num_queued_frames; ++i) {
217                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
218         }
219 }
220
221 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
222 {
223         Frame vf;
224         vf.owner = this;
225
226         unique_lock<mutex> lock(freelist_mutex);  // Meh.
227         if (freelist.empty()) {
228                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
229                         frame_size);
230         } else {
231                 vf.data = freelist.top().release();
232                 vf.size = frame_size;
233                 freelist.pop();  // Meh.
234         }
235         return vf;
236 }
237
238 void MallocFrameAllocator::release_frame(Frame frame)
239 {
240         if (frame.overflow > 0) {
241                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
242         }
243         unique_lock<mutex> lock(freelist_mutex);
244         freelist.push(unique_ptr<uint8_t[]>(frame.data));
245 }
246
// Returns true if a comes before b on a 16-bit counter that wraps around,
// i.e. if stepping forward from a reaches b in fewer than 0x8000 steps
// (and at least one).
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t forward_distance = static_cast<uint16_t>(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
258
259 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
260 {
261         unique_lock<mutex> lock(queue_lock);
262         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
263                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
264                         q->back().timecode, timecode);
265                 frame.owner->release_frame(frame);
266                 return;
267         }
268
269         QueuedFrame qf;
270         qf.format = format;
271         qf.timecode = timecode;
272         qf.frame = frame;
273         q->push_back(move(qf));
274         queues_not_empty.notify_one();  // might be spurious
275 }
276
277 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
278 {
279         FILE *fp = fopen(filename, "wb");
280         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
281                 printf("short write!\n");
282         }
283         fclose(fp);
284 }
285
286 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
287 {
288         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
289 }
290
291 void BMUSBCapture::dequeue_thread_func()
292 {
293         char thread_name[16];
294         snprintf(thread_name, sizeof(thread_name), "bmusb_dequeue_%d", card_index);
295         pthread_setname_np(pthread_self(), thread_name);
296
297         if (has_dequeue_callbacks) {
298                 dequeue_init_callback();
299         }
300         while (!dequeue_thread_should_quit) {
301                 unique_lock<mutex> lock(queue_lock);
302                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
303
304                 if (dequeue_thread_should_quit) break;
305
306                 uint16_t video_timecode = pending_video_frames.front().timecode;
307                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
308                 AudioFormat audio_format;
309                 audio_format.bits_per_sample = 24;
310                 audio_format.num_channels = 8;
311                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
312                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
313                                 video_timecode);
314                         QueuedFrame video_frame = pending_video_frames.front();
315                         pending_video_frames.pop_front();
316                         lock.unlock();
317                         video_frame_allocator->release_frame(video_frame.frame);
318                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
319                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
320                                 audio_timecode);
321                         QueuedFrame audio_frame = pending_audio_frames.front();
322                         pending_audio_frames.pop_front();
323                         lock.unlock();
324                         audio_format.id = audio_frame.format;
325
326                         // Use the video format of the pending frame.
327                         QueuedFrame video_frame = pending_video_frames.front();
328                         VideoFormat video_format;
329                         decode_video_format(video_frame.format, &video_format);
330
331                         frame_callback(audio_timecode,
332                                        FrameAllocator::Frame(), 0, video_format,
333                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
334                 } else {
335                         QueuedFrame video_frame = pending_video_frames.front();
336                         QueuedFrame audio_frame = pending_audio_frames.front();
337                         pending_audio_frames.pop_front();
338                         pending_video_frames.pop_front();
339                         lock.unlock();
340
341 #if 0
342                         char filename[255];
343                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
344                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
345                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
346 #endif
347
348                         VideoFormat video_format;
349                         audio_format.id = audio_frame.format;
350                         if (decode_video_format(video_frame.format, &video_format)) {
351                                 frame_callback(video_timecode,
352                                                video_frame.frame, HEADER_SIZE, video_format,
353                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
354                         } else {
355                                 frame_callback(video_timecode,
356                                                FrameAllocator::Frame(), 0, video_format,
357                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
358                         }
359                 }
360         }
361         if (has_dequeue_callbacks) {
362                 dequeue_cleanup_callback();
363         }
364 }
365
366 void BMUSBCapture::start_new_frame(const uint8_t *start)
367 {
368         uint16_t format = (start[3] << 8) | start[2];
369         uint16_t timecode = (start[1] << 8) | start[0];
370
371         if (current_video_frame.len > 0) {
372                 current_video_frame.received_timestamp = steady_clock::now();
373
374                 // If format is 0x0800 (no signal), add a fake (empty) audio
375                 // frame to get it out of the queue.
376                 // TODO: Figure out if there are other formats that come with
377                 // no audio, and treat them the same.
378                 if (format == 0x0800) {
379                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
380                         if (fake_audio_frame.data == nullptr) {
381                                 // Oh well, it's just a no-signal frame anyway.
382                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
383                                 current_video_frame.owner->release_frame(current_video_frame);
384                                 current_video_frame = video_frame_allocator->alloc_frame();
385                                 return;
386                         }
387                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
388                 }
389                 //dump_frame();
390                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
391
392                 // Update the assumed frame width. We might be one frame too late on format changes,
393                 // but it's much better than asking the user to choose manually.
394                 VideoFormat video_format;
395                 if (decode_video_format(format, &video_format)) {
396                         assumed_frame_width = video_format.width;
397                 }
398         }
399         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
400         //      format, timecode,
401         //      //start[7], start[6], start[5], start[4],
402         //      read_current_frame, FRAME_SIZE);
403
404         current_video_frame = video_frame_allocator->alloc_frame();
405         //if (current_video_frame.data == nullptr) {
406         //      read_current_frame = -1;
407         //} else {
408         //      read_current_frame = 0;
409         //}
410 }
411
412 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
413 {
414         uint16_t format = (start[3] << 8) | start[2];
415         uint16_t timecode = (start[1] << 8) | start[0];
416         if (current_audio_frame.len > 0) {
417                 current_audio_frame.received_timestamp = steady_clock::now();
418                 //dump_audio_block();
419                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
420         }
421         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
422         //      format, timecode, read_current_audio_block);
423         current_audio_frame = audio_frame_allocator->alloc_frame();
424 }
425
426 #if 0
427 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
428 {
429         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
430         for (unsigned j = 0; j < pack->actual_length; j++) {
431         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
432                 printf("%02x", xfr->buffer[j + offset]);
433                 if ((j % 16) == 15)
434                         printf("\n");
435                 else if ((j % 8) == 7)
436                         printf("  ");
437                 else
438                         printf(" ");
439         }
440 }
441 #endif
442
// De-interleaves n bytes from src: bytes at even offsets go to dest1 and
// bytes at odd offsets go to dest2, each packed contiguously. n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t offset = 0; offset < n; offset += 2) {
                const size_t pair = offset / 2;
                dest1[pair] = src[offset];
                dest2[pair] = src[offset + 1];
        }
}
454
455 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
456 {
457         if (current_frame->data == nullptr ||
458             current_frame->len > current_frame->size ||
459             start == end) {
460                 return;
461         }
462
463         int bytes = end - start;
464         if (current_frame->len + bytes > current_frame->size) {
465                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
466                 current_frame->len = current_frame->size;
467                 if (current_frame->overflow > 1048576) {
468                         printf("%d bytes overflow after last %s frame\n",
469                                 int(current_frame->overflow), frame_type_name);
470                         current_frame->overflow = 0;
471                 }
472                 //dump_frame();
473         } else {
474                 if (current_frame->interleaved) {
475                         uint8_t *data = current_frame->data + current_frame->len / 2;
476                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
477                         if (current_frame->len % 2 == 1) {
478                                 ++data;
479                                 swap(data, data2);
480                         }
481                         if (bytes % 2 == 1) {
482                                 *data++ = *start++;
483                                 swap(data, data2);
484                                 ++current_frame->len;
485                                 --bytes;
486                         }
487                         memcpy_interleaved(data, data2, start, bytes);
488                         current_frame->len += bytes;
489                 } else {
490                         memcpy(current_frame->data + current_frame->len, start, bytes);
491                         current_frame->len += bytes;
492                 }
493         }
494 }
495
496 #if 0
// Debugging aid: prints the 32 bytes of an AVX2 register as hex, lowest byte
// first, in four groups of eight, prefixed by the given name.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t lanes[32];
        _mm256_storeu_si256((__m256i *)lanes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");  // Extra gap between groups.
                }
                printf(" %02x", lanes[i]);
        }
        printf("\n");
}
537 #endif
538
539 #ifndef HAS_MULTIVERSIONING
540
541 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
542 {
543         // No fast path possible unless we have multiversioning.
544         return start;
545 }
546
547 #else  // defined(HAS_MULTIVERSIONING)
548
549 __attribute__((target("sse4.1")))
550 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
551
552 __attribute__((target("avx2")))
553 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
554
555 // Does a memcpy and memchr in one to reduce processing time.
556 // Note that the benefit is somewhat limited if your L3 cache is small,
557 // as you'll (unfortunately) spend most of the time loading the data
558 // from main memory.
559 //
560 // Complicated cases are left to the slow path; it basically stops copying
561 // up until the first instance of "sync_char" (usually a bit before, actually).
562 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
563 // data, and what we really need this for is the 00 00 ff ff marker in video data.
564 __attribute__((target("default")))
565 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
566 {
567         // No fast path possible unless we have SSE 4.1 or higher.
568         return start;
569 }
570
571 __attribute__((target("sse4.1", "avx2")))
572 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
573 {
574         if (current_frame->data == nullptr ||
575             current_frame->len > current_frame->size ||
576             start == limit) {
577                 return start;
578         }
579         size_t orig_bytes = limit - start;
580         if (orig_bytes < 128) {
581                 // Don't bother.
582                 return start;
583         }
584
585         // Don't read more bytes than we can write.
586         limit = min(limit, start + (current_frame->size - current_frame->len));
587
588         // Align end to 32 bytes.
589         limit = (const uint8_t *)(intptr_t(limit) & ~31);
590
591         if (start >= limit) {
592                 return start;
593         }
594
595         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
596         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
597         if (aligned_start != start) {
598                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
599                 if (sync_start == nullptr) {
600                         add_to_frame(current_frame, "", start, aligned_start);
601                 } else {
602                         add_to_frame(current_frame, "", start, sync_start);
603                         return sync_start;
604                 }
605         }
606
607         // Make the length a multiple of 64.
608         if (current_frame->interleaved) {
609                 if (((limit - aligned_start) % 64) != 0) {
610                         limit -= 32;
611                 }
612                 assert(((limit - aligned_start) % 64) == 0);
613         }
614
615         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
616 }
617
618 __attribute__((target("avx2")))
619 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
620 {
621         const __m256i needle = _mm256_set1_epi8(sync_char);
622
623         const __restrict __m256i *in = (const __m256i *)aligned_start;
624         if (current_frame->interleaved) {
625                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
626                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
627                 if (current_frame->len % 2 == 1) {
628                         swap(out1, out2);
629                 }
630
631                 __m256i shuffle_cw = _mm256_set_epi8(
632                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
633                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
634                 while (in < (const __m256i *)limit) {
635                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
636                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
637                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
638
639                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
640                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
641                         __m256i found = _mm256_or_si256(found1, found2);
642
643                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
644                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
645                 
646                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
647                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
648
649                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
650                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
651
652                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
653                         _mm256_storeu_si256(out2, hi);
654
655                         if (!_mm256_testz_si256(found, found)) {
656                                 break;
657                         }
658
659                         in += 2;
660                         ++out1;
661                         ++out2;
662                 }
663                 current_frame->len += (uint8_t *)in - aligned_start;
664         } else {
665                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
666                 while (in < (const __m256i *)limit) {
667                         __m256i data = _mm256_load_si256(in);
668                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
669                         __m256i found = _mm256_cmpeq_epi8(data, needle);
670                         if (!_mm256_testz_si256(found, found)) {
671                                 break;
672                         }
673
674                         ++in;
675                         ++out;
676                 }
677                 current_frame->len = (uint8_t *)out - current_frame->data;
678         }
679
680         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
681         return (const uint8_t *)in;
682 }
683
684 __attribute__((target("sse4.1")))
685 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
686 {
687         const __m128i needle = _mm_set1_epi8(sync_char);
688
689         const __m128i *in = (const __m128i *)aligned_start;
690         if (current_frame->interleaved) {
691                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
692                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
693                 if (current_frame->len % 2 == 1) {
694                         swap(out1, out2);
695                 }
696
697                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
698                 while (in < (const __m128i *)limit) {
699                         __m128i data1 = _mm_load_si128(in);
700                         __m128i data2 = _mm_load_si128(in + 1);
701                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
702                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
703                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
704                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
705                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
706                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
707                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
708                         _mm_storeu_si128(out2, hi);
709                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
710                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
711                         if (!_mm_testz_si128(found1, found1) ||
712                             !_mm_testz_si128(found2, found2)) {
713                                 break;
714                         }
715
716                         in += 2;
717                         ++out1;
718                         ++out2;
719                 }
720                 current_frame->len += (uint8_t *)in - aligned_start;
721         } else {
722                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
723                 while (in < (const __m128i *)limit) {
724                         __m128i data = _mm_load_si128(in);
725                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
726                         __m128i found = _mm_cmpeq_epi8(data, needle);
727                         if (!_mm_testz_si128(found, found)) {
728                                 break;
729                         }
730
731                         ++in;
732                         ++out;
733                 }
734                 current_frame->len = (uint8_t *)out - current_frame->data;
735         }
736
737         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
738         return (const uint8_t *)in;
739 }
740
741 #endif  // defined(HAS_MULTIVERSIONING)
742
743 void decode_packs(const libusb_transfer *xfr,
744                   const char *sync_pattern,
745                   int sync_length,
746                   FrameAllocator::Frame *current_frame,
747                   const char *frame_type_name,
748                   function<void(const uint8_t *start)> start_callback)
749 {
750         int offset = 0;
751         for (int i = 0; i < xfr->num_iso_packets; i++) {
752                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
753
754                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
755                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
756                         continue;
757 //exit(5);
758                 }
759
760                 const uint8_t *start = xfr->buffer + offset;
761                 const uint8_t *limit = start + pack->actual_length;
762                 while (start < limit) {  // Usually runs only one iteration.
763                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
764                         if (start == limit) break;
765                         assert(start < limit);
766
767                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
768                         if (start_next_frame == nullptr) {
769                                 // add the rest of the buffer
770                                 add_to_frame(current_frame, frame_type_name, start, limit);
771                                 break;
772                         } else {
773                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
774                                 start = start_next_frame + sync_length;  // skip sync
775                                 start_callback(start);
776                         }
777                 }
778 #if 0
779                 dump_pack(xfr, offset, pack);
780 #endif
781                 offset += pack->length;
782         }
783 }
784
785 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
786 {
787         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
788             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
789                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
790                 libusb_free_transfer(xfr);
791                 exit(3);
792         }
793
794         assert(xfr->user_data != nullptr);
795         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
796
797         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
798                 if (!usb->disconnected) {
799                         fprintf(stderr, "Device went away, stopping transfers.\n");
800                         usb->disconnected = true;
801                         if (usb->card_disconnected_callback) {
802                                 usb->card_disconnected_callback();
803                         }
804                 }
805                 // Don't reschedule the transfer; the loop will stop by itself.
806                 return;
807         }
808
809         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
810                 if (xfr->endpoint == 0x84) {
811                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
812                 } else {
813                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
814
815                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
816                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
817                 }
818         }
819         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
820                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
821                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
822 #if 0
823                 if (setup->wIndex == 44) {
824                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
825                 } else {
826                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
827                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
828                 }
829 #else
830                 memcpy(usb->register_file + usb->current_register, buf, 4);
831                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
832                 if (usb->current_register == 0) {
833                         // read through all of them
834                         printf("register dump:");
835                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
836                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
837                         }
838                         printf("\n");
839                 }
840                 libusb_fill_control_setup(xfr->buffer,
841                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
842                         /*index=*/usb->current_register, /*length=*/4);
843 #endif
844         }
845
846 #if 0
847         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
848         for (i = 0; i < xfr->actual_length; i++) {
849                 printf("%02x", xfr->buffer[i]);
850                 if (i % 16)
851                         printf("\n");
852                 else if (i % 8)
853                         printf("  ");
854                 else
855                         printf(" ");
856         }
857 #endif
858
859         int rc = libusb_submit_transfer(xfr);
860         if (rc < 0) {
861                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
862                 exit(1);
863         }
864 }
865
866 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
867 {
868         if (card_connected_callback != nullptr) {
869                 libusb_device_descriptor desc;
870                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
871                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
872                         libusb_unref_device(dev);
873                         return 1;
874                 }
875
876                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
877                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
878                         card_connected_callback(dev);  // Callback takes ownership.
879                         return 0;
880                 }
881         }
882         libusb_unref_device(dev);
883         return 0;
884 }
885
886 void BMUSBCapture::usb_thread_func()
887 {
888         sched_param param;
889         memset(&param, 0, sizeof(param));
890         param.sched_priority = 1;
891         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
892                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
893         }
894         pthread_setname_np(pthread_self(), "bmusb_usb_drv");
895         while (!should_quit) {
896                 timeval sec { 1, 0 };
897                 int rc = libusb_handle_events_timeout(nullptr, &sec);
898                 if (rc != LIBUSB_SUCCESS)
899                         break;
900         }
901 }
902
903 namespace {
904
905 struct USBCardDevice {
906         uint16_t product;
907         uint8_t bus, port;
908         libusb_device *device;
909 };
910
// Maps a supported USB product ID to a human-readable card name.
// Unknown IDs trip an assertion (and yield nullptr if assertions are
// compiled out).
const char *get_product_name(uint16_t product)
{
        switch (product) {
        case 0xbd3b:
                return "Intensity Shuttle";
        case 0xbd4f:
                return "UltraStudio SDI";
        default:
                assert(false);
                return nullptr;
        }
}
922
923 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
924 {
925         const char *product_name = get_product_name(product);
926
927         char buf[256];
928         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
929                 id, bus, port, product_name);
930         return buf;
931 }
932
933 vector<USBCardDevice> find_all_cards()
934 {
935         libusb_device **devices;
936         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
937         if (num_devices == -1) {
938                 fprintf(stderr, "Error finding USB devices\n");
939                 exit(1);
940         }
941         vector<USBCardDevice> found_cards;
942         for (ssize_t i = 0; i < num_devices; ++i) {
943                 libusb_device_descriptor desc;
944                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
945                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
946                         exit(1);
947                 }
948
949                 uint8_t bus = libusb_get_bus_number(devices[i]);
950                 uint8_t port = libusb_get_port_number(devices[i]);
951
952                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
953                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
954                         libusb_unref_device(devices[i]);
955                         continue;
956                 }
957
958                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
959         }
960         libusb_free_device_list(devices, 0);
961
962         // Sort the devices to get a consistent ordering.
963         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
964                 if (a.product != b.product)
965                         return a.product < b.product;
966                 if (a.bus != b.bus)
967                         return a.bus < b.bus;
968                 return a.port < b.port;
969         });
970
971         return found_cards;
972 }
973
974 libusb_device_handle *open_card(int card_index, string *description)
975 {
976         vector<USBCardDevice> found_cards = find_all_cards();
977
978         for (size_t i = 0; i < found_cards.size(); ++i) {
979                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
980                 fprintf(stderr, "%s\n", tmp_description.c_str());
981                 if (i == size_t(card_index)) {
982                         *description = tmp_description;
983                 }
984         }
985
986         if (size_t(card_index) >= found_cards.size()) {
987                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
988                 exit(1);
989         }
990
991         libusb_device_handle *devh;
992         int rc = libusb_open(found_cards[card_index].device, &devh);
993         if (rc < 0) {
994                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
995                 exit(1);
996         }
997
998         for (size_t i = 0; i < found_cards.size(); ++i) {
999                 libusb_unref_device(found_cards[i].device);
1000         }
1001
1002         return devh;
1003 }
1004
1005 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1006 {
1007         uint8_t bus = libusb_get_bus_number(dev);
1008         uint8_t port = libusb_get_port_number(dev);
1009
1010         libusb_device_descriptor desc;
1011         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1012                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1013                 exit(1);
1014         }
1015
1016         *description = get_card_description(card_index, bus, port, desc.idProduct);
1017
1018         libusb_device_handle *devh;
1019         int rc = libusb_open(dev, &devh);
1020         if (rc < 0) {
1021                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1022                 exit(1);
1023         }
1024
1025         return devh;
1026 }
1027
1028 }  // namespace
1029
1030 unsigned BMUSBCapture::num_cards()
1031 {
1032         int rc = libusb_init(nullptr);
1033         if (rc < 0) {
1034                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1035                 exit(1);
1036         }
1037
1038         vector<USBCardDevice> found_cards = find_all_cards();
1039         unsigned ret = found_cards.size();
1040         for (size_t i = 0; i < found_cards.size(); ++i) {
1041                 libusb_unref_device(found_cards[i].device);
1042         }
1043         return ret;
1044 }
1045
1046 void BMUSBCapture::configure_card()
1047 {
1048         if (video_frame_allocator == nullptr) {
1049                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1050                 set_video_frame_allocator(owned_video_frame_allocator.get());
1051         }
1052         if (audio_frame_allocator == nullptr) {
1053                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1054                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1055         }
1056         dequeue_thread_should_quit = false;
1057         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1058
1059         int rc;
1060         struct libusb_transfer *xfr;
1061
1062         rc = libusb_init(nullptr);
1063         if (rc < 0) {
1064                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1065                 exit(1);
1066         }
1067
1068         if (dev == nullptr) {
1069                 devh = open_card(card_index, &description);
1070         } else {
1071                 devh = open_card(card_index, dev, &description);
1072                 libusb_unref_device(dev);
1073         }
1074         if (!devh) {
1075                 fprintf(stderr, "Error finding USB device\n");
1076                 exit(1);
1077         }
1078
1079         libusb_config_descriptor *config;
1080         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1081         if (rc < 0) {
1082                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1083                 exit(1);
1084         }
1085
1086 #if 0
1087         printf("%d interface\n", config->bNumInterfaces);
1088         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1089                 printf("  interface %d\n", interface_number);
1090                 const libusb_interface *interface = &config->interface[interface_number];
1091                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1092                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1093                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1094                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1095                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1096                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1097                         }
1098                 }
1099         }
1100 #endif
1101
1102         rc = libusb_set_configuration(devh, /*configuration=*/1);
1103         if (rc < 0) {
1104                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1105                 exit(1);
1106         }
1107
1108         rc = libusb_claim_interface(devh, 0);
1109         if (rc < 0) {
1110                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1111                 exit(1);
1112         }
1113
1114         // Alternate setting 1 is output, alternate setting 2 is input.
1115         // Card is reset when switching alternates, so the driver uses
1116         // this “double switch” when it wants to reset.
1117         //
1118         // There's also alternate settings 3 and 4, which seem to be
1119         // like 1 and 2 except they advertise less bandwidth needed.
1120         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1121         if (rc < 0) {
1122                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1123                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1124                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1125                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1126                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1127                 }
1128                 exit(1);
1129         }
1130         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1131         if (rc < 0) {
1132                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1133                 exit(1);
1134         }
1135 #if 0
1136         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1137         if (rc < 0) {
1138                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1139                 exit(1);
1140         }
1141 #endif
1142
1143 #if 0
1144         rc = libusb_claim_interface(devh, 3);
1145         if (rc < 0) {
1146                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1147                 exit(1);
1148         }
1149 #endif
1150
1151         // theories:
1152         //   44 is some kind of timer register (first 16 bits count upwards)
1153         //   24 is some sort of watchdog?
1154         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1155         //      (or will go to 0x73c60010?), also seen 0x73c60100
1156         //   12 also changes all the time, unclear why  
1157         //   16 seems to be autodetected mode somehow
1158         //      --    this is e00115e0 after reset?
1159         //                    ed0115e0 after mode change [to output?]
1160         //                    2d0015e0 after more mode change [to input]
1161         //                    ed0115e0 after more mode change
1162         //                    2d0015e0 after more mode change
1163         //
1164         //                    390115e0 seems to indicate we have signal
1165         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1166         //
1167         //                    200015e0 on startup
1168         //         changes to 250115e0 when we sync to the signal
1169         //
1170         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1171         //
1172         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1173         //
1174         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1175         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1176         //
1177         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1178         //    perhaps some of them are related to analog output?
1179         //
1180         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1181         //    but the driver sets it to 0x8036802a at some point.
1182         //
1183         //    all of this is on request 214/215. other requests (192, 219,
1184         //    222, 223, 224) are used for firmware upgrade. Probably best to
1185         //    stay out of it unless you know what you're doing.
1186         //
1187         //
1188         // register 16:
1189         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1190         //
1191         // theories:
1192         //   0x01 - stable signal
1193         //   0x04 - deep color
1194         //   0x08 - unknown (audio??)
1195         //   0x20 - 720p??
1196         //   0x30 - 576p??
1197
1198         update_capture_mode();
1199
1200         struct ctrl {
1201                 int endpoint;
1202                 int request;
1203                 int index;
1204                 uint32_t data;
1205         };
1206         static const ctrl ctrls[] = {
1207                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1208                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1209
1210                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1211                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1212                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1213                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1214         };
1215
1216         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1217                 uint32_t flipped = htonl(ctrls[req].data);
1218                 static uint8_t value[4];
1219                 memcpy(value, &flipped, sizeof(flipped));
1220                 int size = sizeof(value);
1221                 //if (ctrls[req].request == 215) size = 0;
1222                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1223                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1224                 if (rc < 0) {
1225                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1226                         exit(1);
1227                 }
1228
1229                 if (ctrls[req].index == 16 && rc == 4) {
1230                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1231                 }
1232
1233 #if 0
1234                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1235                 for (int i = 0; i < rc; ++i) {
1236                         printf("%02x", value[i]);
1237                 }
1238                 printf("\n");
1239 #endif
1240         }
1241
1242 #if 0
1243         // DEBUG
1244         for ( ;; ) {
1245                 static int my_index = 0;
1246                 static uint8_t value[4];
1247                 int size = sizeof(value);
1248                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1249                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1250                 if (rc < 0) {
1251                         fprintf(stderr, "Error on control\n");
1252                         exit(1);
1253                 }
1254                 printf("rc=%d index=%d: 0x", rc, my_index);
1255                 for (int i = 0; i < rc; ++i) {
1256                         printf("%02x", value[i]);
1257                 }
1258                 printf("\n");
1259         }
1260 #endif
1261
1262 #if 0
1263         // set up an asynchronous transfer of the timer register
1264         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1265         static int completed = 0;
1266
1267         xfr = libusb_alloc_transfer(0);
1268         libusb_fill_control_setup(cmdbuf,
1269             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1270                 /*index=*/44, /*length=*/4);
1271         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1272         xfr->user_data = this;
1273         libusb_submit_transfer(xfr);
1274
1275         // set up an asynchronous transfer of register 24
1276         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1277         static int completed2 = 0;
1278
1279         xfr = libusb_alloc_transfer(0);
1280         libusb_fill_control_setup(cmdbuf2,
1281             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1282                 /*index=*/24, /*length=*/4);
1283         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1284         xfr->user_data = this;
1285         libusb_submit_transfer(xfr);
1286 #endif
1287
1288         // set up an asynchronous transfer of the register dump
1289         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1290         static int completed3 = 0;
1291
1292         xfr = libusb_alloc_transfer(0);
1293         libusb_fill_control_setup(cmdbuf3,
1294             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1295                 /*index=*/current_register, /*length=*/4);
1296         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1297         xfr->user_data = this;
1298         //libusb_submit_transfer(xfr);
1299
1300         //audiofp = fopen("audio.raw", "wb");
1301
1302         // set up isochronous transfers for audio and video
1303         for (int e = 3; e <= 4; ++e) {
1304                 //int num_transfers = (e == 3) ? 6 : 6;
1305                 int num_transfers = 6;
1306                 for (int i = 0; i < num_transfers; ++i) {
1307                         size_t buf_size;
1308                         int num_iso_pack, size;
1309                         if (e == 3) {
1310                                 // Allocate for minimum width (because that will give us the most
1311                                 // number of packets, so we don't need to reallocated, but we'll
1312                                 // default to 720p for the first frame.
1313                                 size = find_xfer_size_for_width(MIN_WIDTH);
1314                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1315                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1316                         } else {
1317                                 size = 0xc0;
1318                                 num_iso_pack = 80;
1319                                 buf_size = num_iso_pack * size;
1320                         }
1321                         int num_bytes = num_iso_pack * size;
1322                         assert(size_t(num_bytes) <= buf_size);
1323 #if LIBUSB_API_VERSION >= 0x01000105
1324                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1325 #else
1326                         uint8_t *buf = nullptr;
1327 #endif
1328                         if (buf == nullptr) {
1329                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1330 #if LIBUSB_API_VERSION >= 0x01000105
1331                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1332 #else
1333                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1334 #endif
1335                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1336                                 buf = new uint8_t[num_bytes];
1337                         }
1338
1339                         xfr = libusb_alloc_transfer(num_iso_pack);
1340                         if (!xfr) {
1341                                 fprintf(stderr, "oom\n");
1342                                 exit(1);
1343                         }
1344
1345                         int ep = LIBUSB_ENDPOINT_IN | e;
1346                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1347                                 num_iso_pack, cb_xfr, nullptr, 0);
1348                         libusb_set_iso_packet_lengths(xfr, size);
1349                         xfr->user_data = this;
1350
1351                         if (e == 3) {
1352                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1353                         }
1354
1355                         iso_xfrs.push_back(xfr);
1356                 }
1357         }
1358 }
1359
1360 void BMUSBCapture::start_bm_capture()
1361 {
1362         int i = 0;
1363         for (libusb_transfer *xfr : iso_xfrs) {
1364                 int rc = libusb_submit_transfer(xfr);
1365                 ++i;
1366                 if (rc < 0) {
1367                         //printf("num_bytes=%d\n", num_bytes);
1368                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1369                                 xfr->endpoint, i, libusb_error_name(rc));
1370                         exit(1);
1371                 }
1372         }
1373
1374
1375 #if 0
1376         libusb_release_interface(devh, 0);
1377 out:
1378         if (devh)
1379                 libusb_close(devh);
1380         libusb_exit(nullptr);
1381         return rc;
1382 #endif
1383 }
1384
1385 void BMUSBCapture::stop_dequeue_thread()
1386 {
1387         dequeue_thread_should_quit = true;
1388         queues_not_empty.notify_all();
1389         dequeue_thread.join();
1390 }
1391
1392 void BMUSBCapture::start_bm_thread()
1393 {
1394         // Devices leaving are discovered by seeing the isochronous packets
1395         // coming back with errors, so only care about devices joining.
1396         if (card_connected_callback != nullptr) {
1397                 if (libusb_hotplug_register_callback(
1398                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1399                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1400                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1401                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1402                         exit(1);
1403                 }
1404         }
1405
1406         should_quit = false;
1407         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1408 }
1409
1410 void BMUSBCapture::stop_bm_thread()
1411 {
1412         should_quit = true;
1413         usb_thread.join();
1414 }
1415
1416 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1417 {
1418         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1419         VideoMode auto_mode;
1420         auto_mode.name = "Autodetect";
1421         auto_mode.autodetect = true;
1422         return {{ 0, auto_mode }};
1423 }
1424
1425 uint32_t BMUSBCapture::get_current_video_mode() const
1426 {
1427         return 0;  // Matches get_available_video_modes().
1428 }
1429
1430 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1431 {
1432         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1433 }
1434
1435 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1436 {
1437         return {
1438                 { 0x00000000, "HDMI/SDI" },
1439                 { 0x02000000, "Component" },
1440                 { 0x04000000, "Composite" },
1441                 { 0x06000000, "S-video" }
1442         };
1443 }
1444
1445 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1446 {
1447         assert((video_input_id & ~0x06000000) == 0);
1448         current_video_input = video_input_id;
1449         update_capture_mode();
1450 }
1451
1452 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1453 {
1454         return {
1455                 { 0x00000000, "Embedded" },
1456                 { 0x10000000, "Analog" }
1457         };
1458 }
1459
1460 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1461 {
1462         assert((audio_input_id & ~0x10000000) == 0);
1463         current_audio_input = audio_input_id;
1464         update_capture_mode();
1465 }
1466
1467 void BMUSBCapture::update_capture_mode()
1468 {
1469         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1470         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1471         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1472
1473         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1474                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1475         if (rc < 0) {
1476                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1477                 exit(1);
1478         }
1479 }
1480
1481 }  // namespace bmusb