]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Release v0.5.4.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 capture driver, v0.5.4
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
8 #define HAS_MULTIVERSIONING 1
9 #endif
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <libusb.h>
14 #include <unistd.h>
15 #include <netinet/in.h>
16 #include <sched.h>
17 #include <stdint.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #if HAS_MULTIVERSIONING
22 #include <immintrin.h>
23 #endif
24 #include "bmusb/bmusb.h"
25
26 #include <algorithm>
27 #include <atomic>
28 #include <chrono>
29 #include <condition_variable>
30 #include <cstddef>
31 #include <cstdint>
32 #include <deque>
33 #include <functional>
34 #include <memory>
35 #include <mutex>
36 #include <stack>
37 #include <string>
38 #include <thread>
39
40 using namespace std;
41 using namespace std::chrono;
42 using namespace std::placeholders;
43
44 #define USB_VENDOR_BLACKMAGIC 0x1edb
45 #define MIN_WIDTH 640
46 #define HEADER_SIZE 44
47 //#define HEADER_SIZE 0
48 #define AUDIO_HEADER_SIZE 4
49
50 #define FRAME_SIZE (8 << 20)  // 8 MB.
51 #define USB_VIDEO_TRANSFER_SIZE (128 << 10)  // 128 kB.
52
53 namespace bmusb {
54
55 card_connected_callback_t BMUSBCapture::card_connected_callback = nullptr;
56 bool BMUSBCapture::hotplug_existing_devices = false;
57
58 namespace {
59
60 FILE *audiofp;
61
62 thread usb_thread;
63 atomic<bool> should_quit;
64
// Returns the isochronous packet size (in bytes) to use for frames of the
// given width.
//
// Video seems to require isochronous packets scaled with the width; about six
// lines' worth, at two bytes per pixel, appears to be right. The result is
// rounded up to the required 1 kB multiple.
//
// Note that 10-bit input needs a correspondingly larger size (size * 4 / 3);
// that scaling is not applied here.
int find_xfer_size_for_width(int width)
{
        constexpr int granularity = 1024;

        const int unrounded = width * 2 * 6;

        // Round up to the next multiple of the granularity; values that are
        // already a multiple are left untouched.
        return (unrounded + (granularity - 1)) & ~(granularity - 1);
}
79
80 void change_xfer_size_for_width(int width, libusb_transfer *xfr)
81 {
82         assert(width >= MIN_WIDTH);
83         size_t size = find_xfer_size_for_width(width);
84         int num_iso_pack = xfr->length / size;
85         if (num_iso_pack != xfr->num_iso_packets ||
86             size != xfr->iso_packet_desc[0].length) {
87                 xfr->num_iso_packets = num_iso_pack;
88                 libusb_set_iso_packet_lengths(xfr, size);
89         }
90 }
91
// One row of the lookup table used by decode_video_format(). The member
// order matters, since the table is filled in using aggregate initialization.
struct VideoFormatEntry {
        // Format ID after the flag bits have been masked away.
        uint16_t normalized_video_format;

        // Picture geometry.
        unsigned width;
        unsigned height;
        unsigned second_field_start;

        // Extra lines above and below the picture.
        unsigned extra_lines_top;
        unsigned extra_lines_bottom;

        // Frame rate, as a fraction.
        unsigned frame_rate_nom;
        unsigned frame_rate_den;

        bool interlaced;
};
99
100 // Get details for the given video format; returns false if detection was incomplete.
101 bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_format)
102 {
103         decoded_video_format->id = video_format;
104         decoded_video_format->interlaced = false;
105
106         // TODO: Add these for all formats as we find them.
107         decoded_video_format->extra_lines_top = decoded_video_format->extra_lines_bottom = decoded_video_format->second_field_start = 0;
108
109         if (video_format == 0x0800) {
110                 // No video signal. These green pseudo-frames seem to come at about 30.13 Hz.
111                 // It's a strange thing, but what can you do.
112                 decoded_video_format->width = 720;
113                 decoded_video_format->height = 525;
114                 decoded_video_format->extra_lines_top = 0;
115                 decoded_video_format->extra_lines_bottom = 0;
116                 decoded_video_format->frame_rate_nom = 3013;
117                 decoded_video_format->frame_rate_den = 100;
118                 decoded_video_format->has_signal = false;
119                 return true;
120         }
121         if ((video_format & 0xe800) != 0xe800) {
122                 printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
123                         video_format);
124                 decoded_video_format->width = 0;
125                 decoded_video_format->height = 0;
126                 decoded_video_format->extra_lines_top = 0;
127                 decoded_video_format->extra_lines_bottom = 0;
128                 decoded_video_format->frame_rate_nom = 60;
129                 decoded_video_format->frame_rate_den = 1;
130                 decoded_video_format->has_signal = false;
131                 return false;
132         }
133
134         decoded_video_format->has_signal = true;
135
136         // NTSC (480i59.94, I suppose). A special case, see below.
137         if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
138                 decoded_video_format->width = 720;
139                 decoded_video_format->height = 480;
140                 decoded_video_format->extra_lines_top = 17;
141                 decoded_video_format->extra_lines_bottom = 28;
142                 decoded_video_format->frame_rate_nom = 30000;
143                 decoded_video_format->frame_rate_den = 1001;
144                 decoded_video_format->second_field_start = 280;
145                 decoded_video_format->interlaced = true;
146                 return true;
147         }
148
149         // PAL (576i50, I suppose). A special case, see below.
150         if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
151                 decoded_video_format->width = 720;
152                 decoded_video_format->height = 576;
153                 decoded_video_format->extra_lines_top = 22;
154                 decoded_video_format->extra_lines_bottom = 27;
155                 decoded_video_format->frame_rate_nom = 25;
156                 decoded_video_format->frame_rate_den = 1;
157                 decoded_video_format->second_field_start = 335;
158                 decoded_video_format->interlaced = true;
159                 return true;
160         }
161
162         // 0x8 seems to be a flag about availability of deep color on the input,
163         // except when it's not (e.g. it's the only difference between NTSC
164         // and PAL). Rather confusing. But we clear it here nevertheless, because
165         // usually it doesn't mean anything.
166         //
167         // 0x4 is a flag I've only seen from the D4. I don't know what it is.
168         uint16_t normalized_video_format = video_format & ~0xe80c;
169         constexpr VideoFormatEntry entries[] = {
170                 { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
171                 { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
172                 { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
173                 { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
174                 { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
175                 { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
176                 { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
177                 { 0x01c3, 1920, 1080,   0, 20, 25,    30,    1, false },  // 1080p30.
178                 { 0x0003, 1920, 1080, 583, 20, 25,    30,    1,  true },  // 1080i60.
179                 { 0x01e1, 1920, 1080,   0, 20, 25, 30000, 1001, false },  // 1080p29.97.
180                 { 0x0021, 1920, 1080, 583, 20, 25, 30000, 1001,  true },  // 1080i59.94.
181                 { 0x0063, 1920, 1080,   0, 20, 25,    25,    1, false },  // 1080p25.
182                 { 0x0043, 1920, 1080, 583, 20, 25,    25,    1,  true },  // 1080i50.
183                 { 0x0083, 1920, 1080,   0, 20, 25,    24,    1, false },  // 1080p24.
184                 { 0x00a1, 1920, 1080,   0, 20, 25, 24000, 1001, false },  // 1080p23.98.
185         };
186         for (const VideoFormatEntry &entry : entries) {
187                 if (normalized_video_format == entry.normalized_video_format) {
188                         decoded_video_format->width = entry.width;
189                         decoded_video_format->height = entry.height;
190                         decoded_video_format->second_field_start = entry.second_field_start;
191                         decoded_video_format->extra_lines_top = entry.extra_lines_top;
192                         decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
193                         decoded_video_format->frame_rate_nom = entry.frame_rate_nom;
194                         decoded_video_format->frame_rate_den = entry.frame_rate_den;
195                         decoded_video_format->interlaced = entry.interlaced;
196                         return true;
197                 }
198         }
199
200         printf("Unknown video format 0x%04x (normalized 0x%04x). Assuming 720p60.\n", video_format, normalized_video_format);
201         decoded_video_format->width = 1280;
202         decoded_video_format->height = 720;
203         decoded_video_format->frame_rate_nom = 60;
204         decoded_video_format->frame_rate_den = 1;
205         return false;
206 }
207
208 }  // namespace
209
210 FrameAllocator::~FrameAllocator() {}
211
212 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
213         : frame_size(frame_size)
214 {
215         for (size_t i = 0; i < num_queued_frames; ++i) {
216                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
217         }
218 }
219
220 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
221 {
222         Frame vf;
223         vf.owner = this;
224
225         unique_lock<mutex> lock(freelist_mutex);  // Meh.
226         if (freelist.empty()) {
227                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
228                         frame_size);
229         } else {
230                 vf.data = freelist.top().release();
231                 vf.size = frame_size;
232                 freelist.pop();  // Meh.
233         }
234         return vf;
235 }
236
237 void MallocFrameAllocator::release_frame(Frame frame)
238 {
239         if (frame.overflow > 0) {
240                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
241         }
242         unique_lock<mutex> lock(freelist_mutex);
243         freelist.push(unique_ptr<uint8_t[]>(frame.data));
244 }
245
// Compares two 16-bit sequence numbers, taking wraparound into account:
// a counts as less than b if going forward from a reaches b in at least one
// and fewer than 0x8000 steps (modulo 0x10000).
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16. Zero means a == b.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
257
258 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
259 {
260         unique_lock<mutex> lock(queue_lock);
261         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
262                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
263                         q->back().timecode, timecode);
264                 frame.owner->release_frame(frame);
265                 return;
266         }
267
268         QueuedFrame qf;
269         qf.format = format;
270         qf.timecode = timecode;
271         qf.frame = frame;
272         q->push_back(move(qf));
273         queues_not_empty.notify_one();  // might be spurious
274 }
275
276 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
277 {
278         FILE *fp = fopen(filename, "wb");
279         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
280                 printf("short write!\n");
281         }
282         fclose(fp);
283 }
284
285 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
286 {
287         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
288 }
289
290 void BMUSBCapture::dequeue_thread_func()
291 {
292         if (has_dequeue_callbacks) {
293                 dequeue_init_callback();
294         }
295         while (!dequeue_thread_should_quit) {
296                 unique_lock<mutex> lock(queue_lock);
297                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
298
299                 if (dequeue_thread_should_quit) break;
300
301                 uint16_t video_timecode = pending_video_frames.front().timecode;
302                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
303                 AudioFormat audio_format;
304                 audio_format.bits_per_sample = 24;
305                 audio_format.num_channels = 8;
306                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
307                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
308                                 video_timecode);
309                         QueuedFrame video_frame = pending_video_frames.front();
310                         pending_video_frames.pop_front();
311                         lock.unlock();
312                         video_frame_allocator->release_frame(video_frame.frame);
313                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
314                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
315                                 audio_timecode);
316                         QueuedFrame audio_frame = pending_audio_frames.front();
317                         pending_audio_frames.pop_front();
318                         lock.unlock();
319                         audio_format.id = audio_frame.format;
320
321                         // Use the video format of the pending frame.
322                         QueuedFrame video_frame = pending_video_frames.front();
323                         VideoFormat video_format;
324                         decode_video_format(video_frame.format, &video_format);
325
326                         frame_callback(audio_timecode,
327                                        FrameAllocator::Frame(), 0, video_format,
328                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
329                 } else {
330                         QueuedFrame video_frame = pending_video_frames.front();
331                         QueuedFrame audio_frame = pending_audio_frames.front();
332                         pending_audio_frames.pop_front();
333                         pending_video_frames.pop_front();
334                         lock.unlock();
335
336 #if 0
337                         char filename[255];
338                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
339                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
340                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
341 #endif
342
343                         VideoFormat video_format;
344                         audio_format.id = audio_frame.format;
345                         if (decode_video_format(video_frame.format, &video_format)) {
346                                 frame_callback(video_timecode,
347                                                video_frame.frame, HEADER_SIZE, video_format,
348                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
349                         } else {
350                                 frame_callback(video_timecode,
351                                                FrameAllocator::Frame(), 0, video_format,
352                                                audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
353                         }
354                 }
355         }
356         if (has_dequeue_callbacks) {
357                 dequeue_cleanup_callback();
358         }
359 }
360
361 void BMUSBCapture::start_new_frame(const uint8_t *start)
362 {
363         uint16_t format = (start[3] << 8) | start[2];
364         uint16_t timecode = (start[1] << 8) | start[0];
365
366         if (current_video_frame.len > 0) {
367                 current_video_frame.received_timestamp = steady_clock::now();
368
369                 // If format is 0x0800 (no signal), add a fake (empty) audio
370                 // frame to get it out of the queue.
371                 // TODO: Figure out if there are other formats that come with
372                 // no audio, and treat them the same.
373                 if (format == 0x0800) {
374                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
375                         if (fake_audio_frame.data == nullptr) {
376                                 // Oh well, it's just a no-signal frame anyway.
377                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
378                                 current_video_frame.owner->release_frame(current_video_frame);
379                                 current_video_frame = video_frame_allocator->alloc_frame();
380                                 return;
381                         }
382                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
383                 }
384                 //dump_frame();
385                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
386
387                 // Update the assumed frame width. We might be one frame too late on format changes,
388                 // but it's much better than asking the user to choose manually.
389                 VideoFormat video_format;
390                 if (decode_video_format(format, &video_format)) {
391                         assumed_frame_width = video_format.width;
392                 }
393         }
394         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
395         //      format, timecode,
396         //      //start[7], start[6], start[5], start[4],
397         //      read_current_frame, FRAME_SIZE);
398
399         current_video_frame = video_frame_allocator->alloc_frame();
400         //if (current_video_frame.data == nullptr) {
401         //      read_current_frame = -1;
402         //} else {
403         //      read_current_frame = 0;
404         //}
405 }
406
407 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
408 {
409         uint16_t format = (start[3] << 8) | start[2];
410         uint16_t timecode = (start[1] << 8) | start[0];
411         if (current_audio_frame.len > 0) {
412                 current_audio_frame.received_timestamp = steady_clock::now();
413                 //dump_audio_block();
414                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
415         }
416         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
417         //      format, timecode, read_current_audio_block);
418         current_audio_frame = audio_frame_allocator->alloc_frame();
419 }
420
421 #if 0
422 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
423 {
424         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
425         for (unsigned j = 0; j < pack->actual_length; j++) {
426         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
427                 printf("%02x", xfr->buffer[j + offset]);
428                 if ((j % 16) == 15)
429                         printf("\n");
430                 else if ((j % 8) == 7)
431                         printf("  ");
432                 else
433                         printf(" ");
434         }
435 }
436 #endif
437
// De-interleaves n bytes from src: bytes at even offsets go to dest1 and
// bytes at odd offsets go to dest2, each written contiguously from the start
// of its buffer. n must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const uint8_t *const src_end = src + n;
        while (src < src_end) {
                *dest1++ = src[0];
                *dest2++ = src[1];
                src += 2;
        }
}
449
450 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
451 {
452         if (current_frame->data == nullptr ||
453             current_frame->len > current_frame->size ||
454             start == end) {
455                 return;
456         }
457
458         int bytes = end - start;
459         if (current_frame->len + bytes > current_frame->size) {
460                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
461                 current_frame->len = current_frame->size;
462                 if (current_frame->overflow > 1048576) {
463                         printf("%d bytes overflow after last %s frame\n",
464                                 int(current_frame->overflow), frame_type_name);
465                         current_frame->overflow = 0;
466                 }
467                 //dump_frame();
468         } else {
469                 if (current_frame->interleaved) {
470                         uint8_t *data = current_frame->data + current_frame->len / 2;
471                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
472                         if (current_frame->len % 2 == 1) {
473                                 ++data;
474                                 swap(data, data2);
475                         }
476                         if (bytes % 2 == 1) {
477                                 *data++ = *start++;
478                                 swap(data, data2);
479                                 ++current_frame->len;
480                                 --bytes;
481                         }
482                         memcpy_interleaved(data, data2, start, bytes);
483                         current_frame->len += bytes;
484                 } else {
485                         memcpy(current_frame->data + current_frame->len, start, bytes);
486                         current_frame->len += bytes;
487                 }
488         }
489 }
490
491 #if 0
492 void avx2_dump(const char *name, __m256i n)
493 {
494         printf("%-10s:", name);
495         printf(" %02x", _mm256_extract_epi8(n, 0));
496         printf(" %02x", _mm256_extract_epi8(n, 1));
497         printf(" %02x", _mm256_extract_epi8(n, 2));
498         printf(" %02x", _mm256_extract_epi8(n, 3));
499         printf(" %02x", _mm256_extract_epi8(n, 4));
500         printf(" %02x", _mm256_extract_epi8(n, 5));
501         printf(" %02x", _mm256_extract_epi8(n, 6));
502         printf(" %02x", _mm256_extract_epi8(n, 7));
503         printf(" ");
504         printf(" %02x", _mm256_extract_epi8(n, 8));
505         printf(" %02x", _mm256_extract_epi8(n, 9));
506         printf(" %02x", _mm256_extract_epi8(n, 10));
507         printf(" %02x", _mm256_extract_epi8(n, 11));
508         printf(" %02x", _mm256_extract_epi8(n, 12));
509         printf(" %02x", _mm256_extract_epi8(n, 13));
510         printf(" %02x", _mm256_extract_epi8(n, 14));
511         printf(" %02x", _mm256_extract_epi8(n, 15));
512         printf(" ");
513         printf(" %02x", _mm256_extract_epi8(n, 16));
514         printf(" %02x", _mm256_extract_epi8(n, 17));
515         printf(" %02x", _mm256_extract_epi8(n, 18));
516         printf(" %02x", _mm256_extract_epi8(n, 19));
517         printf(" %02x", _mm256_extract_epi8(n, 20));
518         printf(" %02x", _mm256_extract_epi8(n, 21));
519         printf(" %02x", _mm256_extract_epi8(n, 22));
520         printf(" %02x", _mm256_extract_epi8(n, 23));
521         printf(" ");
522         printf(" %02x", _mm256_extract_epi8(n, 24));
523         printf(" %02x", _mm256_extract_epi8(n, 25));
524         printf(" %02x", _mm256_extract_epi8(n, 26));
525         printf(" %02x", _mm256_extract_epi8(n, 27));
526         printf(" %02x", _mm256_extract_epi8(n, 28));
527         printf(" %02x", _mm256_extract_epi8(n, 29));
528         printf(" %02x", _mm256_extract_epi8(n, 30));
529         printf(" %02x", _mm256_extract_epi8(n, 31));
530         printf("\n");
531 }
532 #endif
533
534 #ifndef HAS_MULTIVERSIONING
535
536 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
537 {
538         // No fast path possible unless we have multiversioning.
539         return start;
540 }
541
542 #else  // defined(HAS_MULTIVERSIONING)
543
544 __attribute__((target("sse4.1")))
545 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
546
547 __attribute__((target("avx2")))
548 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
549
550 // Does a memcpy and memchr in one to reduce processing time.
551 // Note that the benefit is somewhat limited if your L3 cache is small,
552 // as you'll (unfortunately) spend most of the time loading the data
553 // from main memory.
554 //
555 // Complicated cases are left to the slow path; it basically stops copying
556 // up until the first instance of "sync_char" (usually a bit before, actually).
557 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
558 // data, and what we really need this for is the 00 00 ff ff marker in video data.
559 __attribute__((target("default")))
560 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
561 {
562         // No fast path possible unless we have SSE 4.1 or higher.
563         return start;
564 }
565
566 __attribute__((target("sse4.1", "avx2")))
567 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
568 {
569         if (current_frame->data == nullptr ||
570             current_frame->len > current_frame->size ||
571             start == limit) {
572                 return start;
573         }
574         size_t orig_bytes = limit - start;
575         if (orig_bytes < 128) {
576                 // Don't bother.
577                 return start;
578         }
579
580         // Don't read more bytes than we can write.
581         limit = min(limit, start + (current_frame->size - current_frame->len));
582
583         // Align end to 32 bytes.
584         limit = (const uint8_t *)(intptr_t(limit) & ~31);
585
586         if (start >= limit) {
587                 return start;
588         }
589
590         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
591         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
592         if (aligned_start != start) {
593                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
594                 if (sync_start == nullptr) {
595                         add_to_frame(current_frame, "", start, aligned_start);
596                 } else {
597                         add_to_frame(current_frame, "", start, sync_start);
598                         return sync_start;
599                 }
600         }
601
602         // Make the length a multiple of 64.
603         if (current_frame->interleaved) {
604                 if (((limit - aligned_start) % 64) != 0) {
605                         limit -= 32;
606                 }
607                 assert(((limit - aligned_start) % 64) == 0);
608         }
609
610         return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
611 }
612
// AVX2 inner loop of the fast path: copies data from [aligned_start, limit)
// into current_frame while looking for sync_char, and stops at the first
// chunk that contains it.
//
// Returns a pointer to the first input byte that was not consumed (the start
// of the chunk containing sync_char, or the end of the loop range if none was
// found). current_frame->len is advanced by the number of bytes consumed.
// Note that the chunk containing sync_char is still written to the output,
// but is not counted in len and not reported as consumed, so the caller is
// expected to process it again.
//
// The input is read with aligned loads, so aligned_start must be 32-byte
// aligned. The interleaved loop consumes two 32-byte vectors per iteration;
// the caller visible in this file makes (limit - aligned_start) a multiple
// of 64 for interleaved frames.
//
// NOTE(review): no bounds checking against current_frame->size is done here;
// presumably the caller has already clamped limit to the room left -- confirm.
__attribute__((target("avx2")))
const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
{
        // sync_char replicated into every byte lane, for the comparisons below.
        const __m256i needle = _mm256_set1_epi8(sync_char);

        // NOTE(review): __restrict is written in front of the pointee type
        // here and below; presumably restrict-qualified pointers were
        // intended -- verify how the compiler actually treats this.
        const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
                // out1 receives the bytes at even input offsets and out2 the
                // ones at odd offsets. If len is odd, the previous write ended
                // in the middle of a pair, so the two planes trade roles.
                __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
                __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                // Within each 128-bit lane: even-offset bytes to the lower
                // half, odd-offset bytes to the upper half.
                __m256i shuffle_cw = _mm256_set_epi8(
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
                        // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
                        __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
                        __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp

                        // Nonzero if sync_char occurs anywhere in these 64 bytes.
                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
                        __m256i found = _mm256_or_si256(found1, found2);

                        data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
                        data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
                
                        data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
                        data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop

                        // Gather the two "even" halves into lo and the two
                        // "odd" halves into hi.
                        __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
                        __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);

                        _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
                        _mm256_storeu_si256(out2, hi);

                        // Stop before advancing, so that this chunk is neither
                        // counted in len nor reported as consumed.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                // len counts bytes across both planes, i.e., input bytes consumed.
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                // Plain copy, 32 bytes at a time.
                __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
                while (in < (const __m256i *)limit) {
                        __m256i data = _mm256_load_si256(in);
                        _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
                        __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // As above: stop before advancing past a chunk that
                        // contains sync_char.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }

        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
        return (const uint8_t *)in;
}
678
679 __attribute__((target("sse4.1")))
680 const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
681 {
682         const __m128i needle = _mm_set1_epi8(sync_char);
683
684         const __m128i *in = (const __m128i *)aligned_start;
685         if (current_frame->interleaved) {
686                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
687                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
688                 if (current_frame->len % 2 == 1) {
689                         swap(out1, out2);
690                 }
691
692                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
693                 while (in < (const __m128i *)limit) {
694                         __m128i data1 = _mm_load_si128(in);
695                         __m128i data2 = _mm_load_si128(in + 1);
696                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
697                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
698                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
699                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
700                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
701                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
702                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
703                         _mm_storeu_si128(out2, hi);
704                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
705                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
706                         if (!_mm_testz_si128(found1, found1) ||
707                             !_mm_testz_si128(found2, found2)) {
708                                 break;
709                         }
710
711                         in += 2;
712                         ++out1;
713                         ++out2;
714                 }
715                 current_frame->len += (uint8_t *)in - aligned_start;
716         } else {
717                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
718                 while (in < (const __m128i *)limit) {
719                         __m128i data = _mm_load_si128(in);
720                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
721                         __m128i found = _mm_cmpeq_epi8(data, needle);
722                         if (!_mm_testz_si128(found, found)) {
723                                 break;
724                         }
725
726                         ++in;
727                         ++out;
728                 }
729                 current_frame->len = (uint8_t *)out - current_frame->data;
730         }
731
732         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
733         return (const uint8_t *)in;
734 }
735
736 #endif  // defined(HAS_MULTIVERSIONING)
737
738 void decode_packs(const libusb_transfer *xfr,
739                   const char *sync_pattern,
740                   int sync_length,
741                   FrameAllocator::Frame *current_frame,
742                   const char *frame_type_name,
743                   function<void(const uint8_t *start)> start_callback)
744 {
745         int offset = 0;
746         for (int i = 0; i < xfr->num_iso_packets; i++) {
747                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
748
749                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
750                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
751                         continue;
752 //exit(5);
753                 }
754
755                 const uint8_t *start = xfr->buffer + offset;
756                 const uint8_t *limit = start + pack->actual_length;
757                 while (start < limit) {  // Usually runs only one iteration.
758                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
759                         if (start == limit) break;
760                         assert(start < limit);
761
762                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
763                         if (start_next_frame == nullptr) {
764                                 // add the rest of the buffer
765                                 add_to_frame(current_frame, frame_type_name, start, limit);
766                                 break;
767                         } else {
768                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
769                                 start = start_next_frame + sync_length;  // skip sync
770                                 start_callback(start);
771                         }
772                 }
773 #if 0
774                 dump_pack(xfr, offset, pack);
775 #endif
776                 offset += pack->length;
777         }
778 }
779
780 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
781 {
782         if (xfr->status != LIBUSB_TRANSFER_COMPLETED &&
783             xfr->status != LIBUSB_TRANSFER_NO_DEVICE) {
784                 fprintf(stderr, "error: transfer status %d\n", xfr->status);
785                 libusb_free_transfer(xfr);
786                 exit(3);
787         }
788
789         assert(xfr->user_data != nullptr);
790         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
791
792         if (xfr->status == LIBUSB_TRANSFER_NO_DEVICE) {
793                 if (!usb->disconnected) {
794                         fprintf(stderr, "Device went away, stopping transfers.\n");
795                         usb->disconnected = true;
796                         if (usb->card_disconnected_callback) {
797                                 usb->card_disconnected_callback();
798                         }
799                 }
800                 // Don't reschedule the transfer; the loop will stop by itself.
801                 return;
802         }
803
804         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
805                 if (xfr->endpoint == 0x84) {
806                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
807                 } else {
808                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
809
810                         // Update the transfer with the new assumed width, if we're in the process of changing formats.
811                         change_xfer_size_for_width(usb->assumed_frame_width, xfr);
812                 }
813         }
814         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
815                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
816                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
817 #if 0
818                 if (setup->wIndex == 44) {
819                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
820                 } else {
821                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
822                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
823                 }
824 #else
825                 memcpy(usb->register_file + usb->current_register, buf, 4);
826                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
827                 if (usb->current_register == 0) {
828                         // read through all of them
829                         printf("register dump:");
830                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
831                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
832                         }
833                         printf("\n");
834                 }
835                 libusb_fill_control_setup(xfr->buffer,
836                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
837                         /*index=*/usb->current_register, /*length=*/4);
838 #endif
839         }
840
841 #if 0
842         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
843         for (i = 0; i < xfr->actual_length; i++) {
844                 printf("%02x", xfr->buffer[i]);
845                 if (i % 16)
846                         printf("\n");
847                 else if (i % 8)
848                         printf("  ");
849                 else
850                         printf(" ");
851         }
852 #endif
853
854         int rc = libusb_submit_transfer(xfr);
855         if (rc < 0) {
856                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
857                 exit(1);
858         }
859 }
860
861 int BMUSBCapture::cb_hotplug(libusb_context *ctx, libusb_device *dev, libusb_hotplug_event event, void *user_data)
862 {
863         if (card_connected_callback != nullptr) {
864                 libusb_device_descriptor desc;
865                 if (libusb_get_device_descriptor(dev, &desc) < 0) {
866                         fprintf(stderr, "Error getting device descriptor for hotplugged device %p, killing hotplug\n", dev);
867                         libusb_unref_device(dev);
868                         return 1;
869                 }
870
871                 if ((desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) ||
872                     (desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
873                         card_connected_callback(dev);  // Callback takes ownership.
874                         return 0;
875                 }
876         }
877         libusb_unref_device(dev);
878         return 0;
879 }
880
881 void BMUSBCapture::usb_thread_func()
882 {
883         sched_param param;
884         memset(&param, 0, sizeof(param));
885         param.sched_priority = 1;
886         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
887                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
888         }
889         while (!should_quit) {
890                 timeval sec { 1, 0 };
891                 int rc = libusb_handle_events_timeout(nullptr, &sec);
892                 if (rc != LIBUSB_SUCCESS)
893                         break;
894         }
895 }
896
897 namespace {
898
899 struct USBCardDevice {
900         uint16_t product;
901         uint8_t bus, port;
902         libusb_device *device;
903 };
904
// Maps a supported USB product ID to a human-readable product name.
// Any other product ID is a programming error: it trips an assertion, and
// yields nullptr if assertions are compiled out.
const char *get_product_name(uint16_t product)
{
	switch (product) {
	case 0xbd3b:
		return "Intensity Shuttle";
	case 0xbd4f:
		return "UltraStudio SDI";
	default:
		assert(false);
		return nullptr;
	}
}
916
917 string get_card_description(int id, uint8_t bus, uint8_t port, uint16_t product)
918 {
919         const char *product_name = get_product_name(product);
920
921         char buf[256];
922         snprintf(buf, sizeof(buf), "USB card %d: Bus %03u Device %03u  %s",
923                 id, bus, port, product_name);
924         return buf;
925 }
926
927 vector<USBCardDevice> find_all_cards()
928 {
929         libusb_device **devices;
930         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
931         if (num_devices == -1) {
932                 fprintf(stderr, "Error finding USB devices\n");
933                 exit(1);
934         }
935         vector<USBCardDevice> found_cards;
936         for (ssize_t i = 0; i < num_devices; ++i) {
937                 libusb_device_descriptor desc;
938                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
939                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
940                         exit(1);
941                 }
942
943                 uint8_t bus = libusb_get_bus_number(devices[i]);
944                 uint8_t port = libusb_get_port_number(devices[i]);
945
946                 if (!(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd3b) &&
947                     !(desc.idVendor == USB_VENDOR_BLACKMAGIC && desc.idProduct == 0xbd4f)) {
948                         libusb_unref_device(devices[i]);
949                         continue;
950                 }
951
952                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
953         }
954         libusb_free_device_list(devices, 0);
955
956         // Sort the devices to get a consistent ordering.
957         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
958                 if (a.product != b.product)
959                         return a.product < b.product;
960                 if (a.bus != b.bus)
961                         return a.bus < b.bus;
962                 return a.port < b.port;
963         });
964
965         return found_cards;
966 }
967
968 libusb_device_handle *open_card(int card_index, string *description)
969 {
970         vector<USBCardDevice> found_cards = find_all_cards();
971
972         for (size_t i = 0; i < found_cards.size(); ++i) {
973                 string tmp_description = get_card_description(i, found_cards[i].bus, found_cards[i].port, found_cards[i].product);
974                 fprintf(stderr, "%s\n", tmp_description.c_str());
975                 if (i == size_t(card_index)) {
976                         *description = tmp_description;
977                 }
978         }
979
980         if (size_t(card_index) >= found_cards.size()) {
981                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
982                 exit(1);
983         }
984
985         libusb_device_handle *devh;
986         int rc = libusb_open(found_cards[card_index].device, &devh);
987         if (rc < 0) {
988                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
989                 exit(1);
990         }
991
992         for (size_t i = 0; i < found_cards.size(); ++i) {
993                 libusb_unref_device(found_cards[i].device);
994         }
995
996         return devh;
997 }
998
999 libusb_device_handle *open_card(unsigned card_index, libusb_device *dev, string *description)
1000 {
1001         uint8_t bus = libusb_get_bus_number(dev);
1002         uint8_t port = libusb_get_port_number(dev);
1003
1004         libusb_device_descriptor desc;
1005         if (libusb_get_device_descriptor(dev, &desc) < 0) {
1006                 fprintf(stderr, "Error getting device descriptor for device %p\n", dev);
1007                 exit(1);
1008         }
1009
1010         *description = get_card_description(card_index, bus, port, desc.idProduct);
1011
1012         libusb_device_handle *devh;
1013         int rc = libusb_open(dev, &devh);
1014         if (rc < 0) {
1015                 fprintf(stderr, "Error opening card %p: %s\n", dev, libusb_error_name(rc));
1016                 exit(1);
1017         }
1018
1019         return devh;
1020 }
1021
1022 }  // namespace
1023
1024 unsigned BMUSBCapture::num_cards()
1025 {
1026         int rc = libusb_init(nullptr);
1027         if (rc < 0) {
1028                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1029                 exit(1);
1030         }
1031
1032         vector<USBCardDevice> found_cards = find_all_cards();
1033         unsigned ret = found_cards.size();
1034         for (size_t i = 0; i < found_cards.size(); ++i) {
1035                 libusb_unref_device(found_cards[i].device);
1036         }
1037         return ret;
1038 }
1039
1040 void BMUSBCapture::configure_card()
1041 {
1042         if (video_frame_allocator == nullptr) {
1043                 owned_video_frame_allocator.reset(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));
1044                 set_video_frame_allocator(owned_video_frame_allocator.get());
1045         }
1046         if (audio_frame_allocator == nullptr) {
1047                 owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
1048                 set_audio_frame_allocator(owned_audio_frame_allocator.get());
1049         }
1050         dequeue_thread_should_quit = false;
1051         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
1052
1053         int rc;
1054         struct libusb_transfer *xfr;
1055
1056         rc = libusb_init(nullptr);
1057         if (rc < 0) {
1058                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
1059                 exit(1);
1060         }
1061
1062         if (dev == nullptr) {
1063                 devh = open_card(card_index, &description);
1064         } else {
1065                 devh = open_card(card_index, dev, &description);
1066                 libusb_unref_device(dev);
1067         }
1068         if (!devh) {
1069                 fprintf(stderr, "Error finding USB device\n");
1070                 exit(1);
1071         }
1072
1073         libusb_config_descriptor *config;
1074         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
1075         if (rc < 0) {
1076                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
1077                 exit(1);
1078         }
1079
1080 #if 0
1081         printf("%d interface\n", config->bNumInterfaces);
1082         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
1083                 printf("  interface %d\n", interface_number);
1084                 const libusb_interface *interface = &config->interface[interface_number];
1085                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
1086                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
1087                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
1088                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
1089                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
1090                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
1091                         }
1092                 }
1093         }
1094 #endif
1095
1096         rc = libusb_set_configuration(devh, /*configuration=*/1);
1097         if (rc < 0) {
1098                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
1099                 exit(1);
1100         }
1101
1102         rc = libusb_claim_interface(devh, 0);
1103         if (rc < 0) {
1104                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
1105                 exit(1);
1106         }
1107
1108         // Alternate setting 1 is output, alternate setting 2 is input.
1109         // Card is reset when switching alternates, so the driver uses
1110         // this “double switch” when it wants to reset.
1111         //
1112         // There's also alternate settings 3 and 4, which seem to be
1113         // like 1 and 2 except they advertise less bandwidth needed.
1114         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1115         if (rc < 0) {
1116                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1117                 if (rc == LIBUSB_ERROR_NOT_FOUND) {
1118                         fprintf(stderr, "This is usually because the card came up in USB2 mode.\n");
1119                         fprintf(stderr, "In particular, this tends to happen if you boot up with the\n");
1120                         fprintf(stderr, "card plugged in; just unplug and replug it, and it usually works.\n");
1121                 }
1122                 exit(1);
1123         }
1124         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
1125         if (rc < 0) {
1126                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
1127                 exit(1);
1128         }
1129 #if 0
1130         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
1131         if (rc < 0) {
1132                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
1133                 exit(1);
1134         }
1135 #endif
1136
1137 #if 0
1138         rc = libusb_claim_interface(devh, 3);
1139         if (rc < 0) {
1140                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
1141                 exit(1);
1142         }
1143 #endif
1144
1145         // theories:
1146         //   44 is some kind of timer register (first 16 bits count upwards)
1147         //   24 is some sort of watchdog?
1148         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
1149         //      (or will go to 0x73c60010?), also seen 0x73c60100
1150         //   12 also changes all the time, unclear why  
1151         //   16 seems to be autodetected mode somehow
1152         //      --    this is e00115e0 after reset?
1153         //                    ed0115e0 after mode change [to output?]
1154         //                    2d0015e0 after more mode change [to input]
1155         //                    ed0115e0 after more mode change
1156         //                    2d0015e0 after more mode change
1157         //
1158         //                    390115e0 seems to indicate we have signal
1159         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
1160         //
1161         //                    200015e0 on startup
1162         //         changes to 250115e0 when we sync to the signal
1163         //
1164         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
1165         //
1166         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
1167         //
1168         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
1169         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
1170         //
1171         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
1172         //    perhaps some of them are related to analog output?
1173         //
1174         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
1175         //    but the driver sets it to 0x8036802a at some point.
1176         //
1177         //    all of this is on request 214/215. other requests (192, 219,
1178         //    222, 223, 224) are used for firmware upgrade. Probably best to
1179         //    stay out of it unless you know what you're doing.
1180         //
1181         //
1182         // register 16:
1183         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
1184         //
1185         // theories:
1186         //   0x01 - stable signal
1187         //   0x04 - deep color
1188         //   0x08 - unknown (audio??)
1189         //   0x20 - 720p??
1190         //   0x30 - 576p??
1191
1192         update_capture_mode();
1193
1194         struct ctrl {
1195                 int endpoint;
1196                 int request;
1197                 int index;
1198                 uint32_t data;
1199         };
1200         static const ctrl ctrls[] = {
1201                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
1202                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
1203
1204                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
1205                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
1206                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
1207                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
1208         };
1209
1210         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
1211                 uint32_t flipped = htonl(ctrls[req].data);
1212                 static uint8_t value[4];
1213                 memcpy(value, &flipped, sizeof(flipped));
1214                 int size = sizeof(value);
1215                 //if (ctrls[req].request == 215) size = 0;
1216                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
1217                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
1218                 if (rc < 0) {
1219                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
1220                         exit(1);
1221                 }
1222
1223                 if (ctrls[req].index == 16 && rc == 4) {
1224                         printf("Card firmware version: 0x%02x%02x\n", value[2], value[3]);
1225                 }
1226
1227 #if 0
1228                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
1229                 for (int i = 0; i < rc; ++i) {
1230                         printf("%02x", value[i]);
1231                 }
1232                 printf("\n");
1233 #endif
1234         }
1235
1236 #if 0
1237         // DEBUG
1238         for ( ;; ) {
1239                 static int my_index = 0;
1240                 static uint8_t value[4];
1241                 int size = sizeof(value);
1242                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
1243                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
1244                 if (rc < 0) {
1245                         fprintf(stderr, "Error on control\n");
1246                         exit(1);
1247                 }
1248                 printf("rc=%d index=%d: 0x", rc, my_index);
1249                 for (int i = 0; i < rc; ++i) {
1250                         printf("%02x", value[i]);
1251                 }
1252                 printf("\n");
1253         }
1254 #endif
1255
1256 #if 0
1257         // set up an asynchronous transfer of the timer register
1258         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
1259         static int completed = 0;
1260
1261         xfr = libusb_alloc_transfer(0);
1262         libusb_fill_control_setup(cmdbuf,
1263             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1264                 /*index=*/44, /*length=*/4);
1265         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
1266         xfr->user_data = this;
1267         libusb_submit_transfer(xfr);
1268
1269         // set up an asynchronous transfer of register 24
1270         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
1271         static int completed2 = 0;
1272
1273         xfr = libusb_alloc_transfer(0);
1274         libusb_fill_control_setup(cmdbuf2,
1275             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1276                 /*index=*/24, /*length=*/4);
1277         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
1278         xfr->user_data = this;
1279         libusb_submit_transfer(xfr);
1280 #endif
1281
1282         // set up an asynchronous transfer of the register dump
1283         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
1284         static int completed3 = 0;
1285
1286         xfr = libusb_alloc_transfer(0);
1287         libusb_fill_control_setup(cmdbuf3,
1288             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
1289                 /*index=*/current_register, /*length=*/4);
1290         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
1291         xfr->user_data = this;
1292         //libusb_submit_transfer(xfr);
1293
1294         //audiofp = fopen("audio.raw", "wb");
1295
1296         // set up isochronous transfers for audio and video
1297         for (int e = 3; e <= 4; ++e) {
1298                 //int num_transfers = (e == 3) ? 6 : 6;
1299                 int num_transfers = 6;
1300                 for (int i = 0; i < num_transfers; ++i) {
1301                         size_t buf_size;
1302                         int num_iso_pack, size;
1303                         if (e == 3) {
1304                                 // Allocate for minimum width (because that will give us the most
1305                                 // number of packets, so we don't need to reallocated, but we'll
1306                                 // default to 720p for the first frame.
1307                                 size = find_xfer_size_for_width(MIN_WIDTH);
1308                                 num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
1309                                 buf_size = USB_VIDEO_TRANSFER_SIZE;
1310                         } else {
1311                                 size = 0xc0;
1312                                 num_iso_pack = 80;
1313                                 buf_size = num_iso_pack * size;
1314                         }
1315                         int num_bytes = num_iso_pack * size;
1316                         assert(size_t(num_bytes) <= buf_size);
1317 #if LIBUSB_API_VERSION >= 0x01000105
1318                         uint8_t *buf = libusb_dev_mem_alloc(devh, num_bytes);
1319 #else
1320                         uint8_t *buf = nullptr;
1321 #endif
1322                         if (buf == nullptr) {
1323                                 fprintf(stderr, "Failed to allocate persistent DMA memory ");
1324 #if LIBUSB_API_VERSION >= 0x01000105
1325                                 fprintf(stderr, "(probably too old kernel; use 4.6.0 or newer).\n");
1326 #else
1327                                 fprintf(stderr, "(compiled against too old libusb-1.0).\n");
1328 #endif
1329                                 fprintf(stderr, "Will go slower, and likely fail due to memory fragmentation after a few hours.\n");
1330                                 buf = new uint8_t[num_bytes];
1331                         }
1332
1333                         xfr = libusb_alloc_transfer(num_iso_pack);
1334                         if (!xfr) {
1335                                 fprintf(stderr, "oom\n");
1336                                 exit(1);
1337                         }
1338
1339                         int ep = LIBUSB_ENDPOINT_IN | e;
1340                         libusb_fill_iso_transfer(xfr, devh, ep, buf, buf_size,
1341                                 num_iso_pack, cb_xfr, nullptr, 0);
1342                         libusb_set_iso_packet_lengths(xfr, size);
1343                         xfr->user_data = this;
1344
1345                         if (e == 3) {
1346                                 change_xfer_size_for_width(assumed_frame_width, xfr);
1347                         }
1348
1349                         iso_xfrs.push_back(xfr);
1350                 }
1351         }
1352 }
1353
1354 void BMUSBCapture::start_bm_capture()
1355 {
1356         int i = 0;
1357         for (libusb_transfer *xfr : iso_xfrs) {
1358                 int rc = libusb_submit_transfer(xfr);
1359                 ++i;
1360                 if (rc < 0) {
1361                         //printf("num_bytes=%d\n", num_bytes);
1362                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1363                                 xfr->endpoint, i, libusb_error_name(rc));
1364                         exit(1);
1365                 }
1366         }
1367
1368
1369 #if 0
1370         libusb_release_interface(devh, 0);
1371 out:
1372         if (devh)
1373                 libusb_close(devh);
1374         libusb_exit(nullptr);
1375         return rc;
1376 #endif
1377 }
1378
1379 void BMUSBCapture::stop_dequeue_thread()
1380 {
1381         dequeue_thread_should_quit = true;
1382         queues_not_empty.notify_all();
1383         dequeue_thread.join();
1384 }
1385
1386 void BMUSBCapture::start_bm_thread()
1387 {
1388         // Devices leaving are discovered by seeing the isochronous packets
1389         // coming back with errors, so only care about devices joining.
1390         if (card_connected_callback != nullptr) {
1391                 if (libusb_hotplug_register_callback(
1392                         nullptr, LIBUSB_HOTPLUG_EVENT_DEVICE_ARRIVED, hotplug_existing_devices ? LIBUSB_HOTPLUG_ENUMERATE : LIBUSB_HOTPLUG_NO_FLAGS,
1393                         USB_VENDOR_BLACKMAGIC, LIBUSB_HOTPLUG_MATCH_ANY, LIBUSB_HOTPLUG_MATCH_ANY,
1394                         &BMUSBCapture::cb_hotplug, nullptr, nullptr) < 0) {
1395                         fprintf(stderr, "libusb_hotplug_register_callback() failed\n");
1396                         exit(1);
1397                 }
1398         }
1399
1400         should_quit = false;
1401         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1402 }
1403
1404 void BMUSBCapture::stop_bm_thread()
1405 {
1406         should_quit = true;
1407         usb_thread.join();
1408 }
1409
1410 map<uint32_t, VideoMode> BMUSBCapture::get_available_video_modes() const
1411 {
1412         // The USB3 cards autodetect, and seem to have no provision for forcing modes.
1413         VideoMode auto_mode;
1414         auto_mode.name = "Autodetect";
1415         auto_mode.autodetect = true;
1416         return {{ 0, auto_mode }};
1417 }
1418
1419 uint32_t BMUSBCapture::get_current_video_mode() const
1420 {
1421         return 0;  // Matches get_available_video_modes().
1422 }
1423
1424 void BMUSBCapture::set_video_mode(uint32_t video_mode_id)
1425 {
1426         assert(video_mode_id == 0);  // Matches get_available_video_modes().
1427 }
1428
1429 std::map<uint32_t, std::string> BMUSBCapture::get_available_video_inputs() const
1430 {
1431         return {
1432                 { 0x00000000, "HDMI/SDI" },
1433                 { 0x02000000, "Component" },
1434                 { 0x04000000, "Composite" },
1435                 { 0x06000000, "S-video" }
1436         };
1437 }
1438
1439 void BMUSBCapture::set_video_input(uint32_t video_input_id)
1440 {
1441         assert((video_input_id & ~0x06000000) == 0);
1442         current_video_input = video_input_id;
1443         update_capture_mode();
1444 }
1445
1446 std::map<uint32_t, std::string> BMUSBCapture::get_available_audio_inputs() const
1447 {
1448         return {
1449                 { 0x00000000, "Embedded" },
1450                 { 0x10000000, "Analog" }
1451         };
1452 }
1453
1454 void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
1455 {
1456         assert((audio_input_id & ~0x10000000) == 0);
1457         current_audio_input = audio_input_id;
1458         update_capture_mode();
1459 }
1460
1461 void BMUSBCapture::update_capture_mode()
1462 {
1463         // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
1464         // clearing the 0x08000000 bit seems to change the capture format (other source?)
1465         uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
1466
1467         int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
1468                 /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);
1469         if (rc < 0) {
1470                 fprintf(stderr, "Error on setting mode: %s\n", libusb_error_name(rc));
1471                 exit(1);
1472         }
1473 }
1474
1475 }  // namespace bmusb