]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Use even smaller transfers since evidently the kernel has problems allocating 512...
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
// Nominal picture geometry; the 1920x1125 alternative is kept commented out.
// NOTE(review): nothing in the visible part of this file reads WIDTH or HEIGHT
// now that FRAME_SIZE is a fixed constant -- confirm before removing them.
#define WIDTH 1280
#define HEIGHT 750  /* 30 lines ancillary data? */
//#define WIDTH 1920
//#define HEIGHT 1125  /* ??? lines ancillary data? */

// Number of bytes at the start of a video frame buffer that dump_frame() skips;
// the same value is passed to the frame callback together with the video frame.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0

// The audio counterpart of HEADER_SIZE (skipped by dump_audio_block() and
// passed to the frame callback together with the audio frame).
#define AUDIO_HEADER_SIZE 4

//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210

// Size in bytes (8 << 20 = 8 MiB) of each buffer in the default video frame
// allocator created by BMUSBCapture::configure_card().
#define FRAME_SIZE (8 << 20)
47
// Stream that dump_audio_block() writes to.
// NOTE(review): nothing in the visible part of this file opens it -- confirm
// where (or whether) it is ever assigned.
FILE *audiofp;

// Presumably the thread that runs BMUSBCapture::usb_thread_func() -- it is
// not started in the visible part of this file; TODO confirm.
thread usb_thread;

// Polled by BMUSBCapture::usb_thread_func(); once it reads true, that
// function's event loop stops.
atomic<bool> should_quit;
52
53 FrameAllocator::~FrameAllocator() {}
54
// Audio is more important than video, and also much cheaper.
// By having many more audio frames available, hopefully if something
// starts to drop, we'll have CPU load go down (from not having to
// process as much video) before we have to drop audio.
//
// These are the pool sizes given to the default MallocFrameAllocator
// instances that BMUSBCapture::configure_card() creates when no allocator
// has been set.
#define NUM_QUEUED_VIDEO_FRAMES 16
#define NUM_QUEUED_AUDIO_FRAMES 64
61
62 class MallocFrameAllocator : public FrameAllocator {
63 public:
64         MallocFrameAllocator(size_t frame_size, size_t num_queued_frames);
65         Frame alloc_frame() override;
66         void release_frame(Frame frame) override;
67
68 private:
69         size_t frame_size;
70
71         mutex freelist_mutex;
72         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
73 };
74
75 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
76         : frame_size(frame_size)
77 {
78         for (size_t i = 0; i < num_queued_frames; ++i) {
79                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
80         }
81 }
82
83 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
84 {
85         Frame vf;
86         vf.owner = this;
87
88         unique_lock<mutex> lock(freelist_mutex);  // Meh.
89         if (freelist.empty()) {
90                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
91                         frame_size);
92         } else {
93                 vf.data = freelist.top().release();
94                 vf.size = frame_size;
95                 freelist.pop();  // Meh.
96         }
97         return vf;
98 }
99
100 void MallocFrameAllocator::release_frame(Frame frame)
101 {
102         if (frame.overflow > 0) {
103                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
104         }
105         unique_lock<mutex> lock(freelist_mutex);
106         freelist.push(unique_ptr<uint8_t[]>(frame.data));
107 }
108
// Compares two 16-bit counters that wrap around at 0x10000.
// Returns true if <b> lies strictly ahead of <a> by less than half the
// counter range (0x8000); equal values and distances of 0x8000 or more
// yield false.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Distance from a forward to b, modulo 0x10000.
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
120
121 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
122 {
123         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
124                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
125                         q->back().timecode, timecode);
126                 frame.owner->release_frame(frame);
127                 return;
128         }
129
130         QueuedFrame qf;
131         qf.format = format;
132         qf.timecode = timecode;
133         qf.frame = frame;
134
135         {
136                 unique_lock<mutex> lock(queue_lock);
137                 q->push_back(move(qf));
138         }
139         queues_not_empty.notify_one();  // might be spurious
140 }
141
142 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
143 {
144         FILE *fp = fopen(filename, "wb");
145         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
146                 printf("short write!\n");
147         }
148         fclose(fp);
149 }
150
151 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
152 {
153         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
154 }
155
156 void BMUSBCapture::dequeue_thread_func()
157 {
158         if (has_dequeue_callbacks) {
159                 dequeue_init_callback();
160         }
161         while (!dequeue_thread_should_quit) {
162                 unique_lock<mutex> lock(queue_lock);
163                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
164
165                 uint16_t video_timecode = pending_video_frames.front().timecode;
166                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
167                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
168                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
169                                 video_timecode);
170                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
171                         pending_video_frames.pop_front();
172                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
173                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
174                                 audio_timecode);
175                         QueuedFrame audio_frame = pending_audio_frames.front();
176                         pending_audio_frames.pop_front();
177                         lock.unlock();
178                         frame_callback(audio_timecode,
179                                        FrameAllocator::Frame(), 0, 0x0000,
180                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
181                 } else {
182                         QueuedFrame video_frame = pending_video_frames.front();
183                         QueuedFrame audio_frame = pending_audio_frames.front();
184                         pending_audio_frames.pop_front();
185                         pending_video_frames.pop_front();
186                         lock.unlock();
187
188 #if 0
189                         char filename[255];
190                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
191                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
192                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
193 #endif
194
195                         frame_callback(video_timecode,
196                                        video_frame.frame, HEADER_SIZE, video_frame.format,
197                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
198                 }
199         }
200         if (has_dequeue_callbacks) {
201                 dequeue_cleanup_callback();
202         }
203 }
204
205 void BMUSBCapture::start_new_frame(const uint8_t *start)
206 {
207         uint16_t format = (start[3] << 8) | start[2];
208         uint16_t timecode = (start[1] << 8) | start[0];
209
210         if (current_video_frame.len > 0) {
211                 // If format is 0x0800 (no signal), add a fake (empty) audio
212                 // frame to get it out of the queue.
213                 // TODO: Figure out if there are other formats that come with
214                 // no audio, and treat them the same.
215                 if (format == 0x0800) {
216                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
217                         if (fake_audio_frame.data == nullptr) {
218                                 // Oh well, it's just a no-signal frame anyway.
219                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
220                                 current_video_frame.owner->release_frame(current_video_frame);
221                                 current_video_frame = video_frame_allocator->alloc_frame();
222                                 return;
223                         }
224                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
225                 }
226                 //dump_frame();
227                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
228         }
229         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
230         //      format, timecode,
231         //      //start[7], start[6], start[5], start[4],
232         //      read_current_frame, FRAME_SIZE);
233
234         current_video_frame = video_frame_allocator->alloc_frame();
235         //if (current_video_frame.data == nullptr) {
236         //      read_current_frame = -1;
237         //} else {
238         //      read_current_frame = 0;
239         //}
240 }
241
242 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
243 {
244         uint16_t format = (start[3] << 8) | start[2];
245         uint16_t timecode = (start[1] << 8) | start[0];
246         if (current_audio_frame.len > 0) {
247                 //dump_audio_block();
248                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
249         }
250         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
251         //      format, timecode, read_current_audio_block);
252         current_audio_frame = audio_frame_allocator->alloc_frame();
253 }
254
255 #if 0
256 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
257 {
258         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
259         for (unsigned j = 0; j < pack->actual_length; j++) {
260         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
261                 printf("%02x", xfr->buffer[j + offset]);
262                 if ((j % 16) == 15)
263                         printf("\n");
264                 else if ((j % 8) == 7)
265                         printf("  ");
266                 else
267                         printf(" ");
268         }
269 }
270 #endif
271
// Splits <n> bytes from <src> into two streams: bytes at even offsets go to
// <dest1>, bytes at odd offsets go to <dest2> (n / 2 bytes each).
// <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
283
284 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
285 {
286         if (current_frame->data == nullptr ||
287             current_frame->len > current_frame->size ||
288             start == end) {
289                 return;
290         }
291
292         int bytes = end - start;
293         if (current_frame->len + bytes > current_frame->size) {
294                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
295                 current_frame->len = current_frame->size;
296                 if (current_frame->overflow > 1048576) {
297                         printf("%d bytes overflow after last %s frame\n",
298                                 int(current_frame->overflow), frame_type_name);
299                         current_frame->overflow = 0;
300                 }
301                 //dump_frame();
302         } else {
303                 if (current_frame->interleaved) {
304                         uint8_t *data = current_frame->data + current_frame->len / 2;
305                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
306                         if (current_frame->len % 2 == 1) {
307                                 ++data;
308                                 swap(data, data2);
309                         }
310                         if (bytes % 2 == 1) {
311                                 *data++ = *start++;
312                                 swap(data, data2);
313                                 ++current_frame->len;
314                                 --bytes;
315                         }
316                         memcpy_interleaved(data, data2, start, bytes);
317                         current_frame->len += bytes;
318                 } else {
319                         memcpy(current_frame->data + current_frame->len, start, bytes);
320                         current_frame->len += bytes;
321                 }
322         }
323 }
324
325 #ifdef __SSE4_1__
326
327 #if 0
// Debugging aid: prints <name> followed by the 32 bytes of <n> in hex, lowest
// byte first, with an extra space between groups of eight bytes.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t lanes[32];
        _mm256_storeu_si256((__m256i *)lanes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", lanes[i]);
        }
        printf("\n");
}
368 #endif
369
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data
// from main memory.
//
// Complicated cases are left to the slow path; it basically stops copying
// up until the first instance of "sync_char" (usually a bit before, actually).
// This is fine, since 0x00 bytes shouldn't really show up in normal picture
// data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Parameters:
//   current_frame: frame that the bytes are appended to (len is advanced).
//   start, limit:  the input range [start, limit).
//   sync_char:     byte value that makes the copy stop.
//
// Returns a pointer to the first input byte that was NOT appended to the
// frame; the caller is expected to handle [return value, limit) itself.
// The return value is <start> unchanged if the frame is unusable, if fewer
// than 128 bytes are available, or if nothing fits in the frame.
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        if (current_frame->data == nullptr ||
            current_frame->len > current_frame->size ||
            start == limit) {
                return start;
        }
        size_t orig_bytes = limit - start;
        if (orig_bytes < 128) {
                // Don't bother.
                return start;
        }

        // Don't read more bytes than we can write.
        limit = min(limit, start + (current_frame->size - current_frame->len));

        // Align end to 32 bytes.
        limit = (const uint8_t *)(intptr_t(limit) & ~31);

        if (start >= limit) {
                return start;
        }

        // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        // If the sync character shows up in this unaligned prefix, only the
        // bytes before it are added and we hand over to the caller right there.
        const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
        if (aligned_start != start) {
                const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
                if (sync_start == nullptr) {
                        add_to_frame(current_frame, "", start, aligned_start);
                } else {
                        add_to_frame(current_frame, "", start, sync_start);
                        return sync_start;
                }
        }

        // Make the length a multiple of 64.
        // (The interleaved AVX2 loop below consumes two 32-byte vectors per
        // iteration. Both limit and aligned_start are multiples of 32 here, so
        // dropping one 32-byte block is enough.)
        if (current_frame->interleaved) {
                if (((limit - aligned_start) % 64) != 0) {
                        limit -= 32;
                }
                assert(((limit - aligned_start) % 64) == 0);
        }

#if __AVX2__
        // One vector with sync_char in every byte, to compare against.
        const __m256i needle = _mm256_set1_epi8(sync_char);

        const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
                // out1 receives the input bytes at even offsets, out2 the ones at
                // odd offsets. If len is odd, data already holds one byte more
                // than data2, so the roles of the two planes are swapped.
                __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
                __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                // Within each 128-bit lane: even-offset bytes to the low half,
                // odd-offset bytes to the high half.
                __m256i shuffle_cw = _mm256_set_epi8(
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
                        // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
                        __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
                        __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp

                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
                        __m256i found = _mm256_or_si256(found1, found2);

                        data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
                        data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop

                        data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
                        data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop

                        __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
                        __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);

                        _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
                        _mm256_storeu_si256(out2, hi);

                        // A sync character somewhere in these 64 bytes: stop without
                        // advancing, so that the block is not counted in len and is
                        // returned to the caller as unconsumed.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                // Only the blocks that <in> was advanced past count as appended.
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
                while (in < (const __m256i *)limit) {
                        __m256i data = _mm256_load_si256(in);
                        _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
                        __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // Same as above: a block containing the sync character is not counted.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#else
        // Same algorithm with 128-bit vectors, for builds without AVX2.
        const __m128i needle = _mm_set1_epi8(sync_char);

        const __m128i *in = (const __m128i *)aligned_start;
        if (current_frame->interleaved) {
                // See the AVX2 branch for the meaning of out1/out2 and the swap.
                __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
                __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
                while (in < (const __m128i *)limit) {
                        __m128i data1 = _mm_load_si128(in);
                        __m128i data2 = _mm_load_si128(in + 1);
                        // Treat the input as 16-bit words: the low byte of each word is
                        // the even-offset byte, the high byte the odd-offset one. The
                        // pack instructions then squeeze each set into 16 bytes.
                        __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
                        __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
                        __m128i data1_hi = _mm_srli_epi16(data1, 8);
                        __m128i data2_hi = _mm_srli_epi16(data2, 8);
                        __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
                        _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
                        __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
                        _mm_storeu_si128(out2, hi);
                        __m128i found1 = _mm_cmpeq_epi8(data1, needle);
                        __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                        // A block containing the sync character is not counted.
                        if (!_mm_testz_si128(found1, found1) ||
                            !_mm_testz_si128(found2, found2)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
                while (in < (const __m128i *)limit) {
                        __m128i data = _mm_load_si128(in);
                        _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
                        __m128i found = _mm_cmpeq_epi8(data, needle);
                        if (!_mm_testz_si128(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#endif

        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);

        // Everything from here on is left for the caller.
        return (const uint8_t *)in;
}
537 #endif
538
539 void decode_packs(const libusb_transfer *xfr,
540                   const char *sync_pattern,
541                   int sync_length,
542                   FrameAllocator::Frame *current_frame,
543                   const char *frame_type_name,
544                   function<void(const uint8_t *start)> start_callback)
545 {
546         int offset = 0;
547         for (int i = 0; i < xfr->num_iso_packets; i++) {
548                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
549
550                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
551                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
552                         continue;
553 //exit(5);
554                 }
555
556                 const uint8_t *start = xfr->buffer + offset;
557                 const uint8_t *limit = start + pack->actual_length;
558                 while (start < limit) {  // Usually runs only one iteration.
559 #ifdef __SSE4_1__
560                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
561                         if (start == limit) break;
562                         assert(start < limit);
563 #endif
564
565                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
566                         if (start_next_frame == nullptr) {
567                                 // add the rest of the buffer
568                                 add_to_frame(current_frame, frame_type_name, start, limit);
569                                 break;
570                         } else {
571                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
572                                 start = start_next_frame + sync_length;  // skip sync
573                                 start_callback(start);
574                         }
575                 }
576 #if 0
577                 dump_pack(xfr, offset, pack);
578 #endif
579                 offset += pack->length;
580         }
581 }
582
583 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
584 {
585         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
586                 fprintf(stderr, "transfer status %d\n", xfr->status);
587                 libusb_free_transfer(xfr);
588                 exit(3);
589         }
590
591         assert(xfr->user_data != nullptr);
592         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
593
594         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
595                 if (xfr->endpoint == 0x84) {
596                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
597                 } else {
598                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
599                 }
600         }
601         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
602                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
603                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
604 #if 0
605                 if (setup->wIndex == 44) {
606                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
607                 } else {
608                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
609                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
610                 }
611 #else
612                 memcpy(usb->register_file + usb->current_register, buf, 4);
613                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
614                 if (usb->current_register == 0) {
615                         // read through all of them
616                         printf("register dump:");
617                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
618                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
619                         }
620                         printf("\n");
621                 }
622                 libusb_fill_control_setup(xfr->buffer,
623                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
624                         /*index=*/usb->current_register, /*length=*/4);
625 #endif
626         }
627
628 #if 0
629         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
630         for (i = 0; i < xfr->actual_length; i++) {
631                 printf("%02x", xfr->buffer[i]);
632                 if (i % 16)
633                         printf("\n");
634                 else if (i % 8)
635                         printf("  ");
636                 else
637                         printf(" ");
638         }
639 #endif
640
641         int rc = libusb_submit_transfer(xfr);
642         if (rc < 0) {
643                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
644                 exit(1);
645         }
646 }
647
648 void BMUSBCapture::usb_thread_func()
649 {
650         sched_param param;
651         memset(&param, 0, sizeof(param));
652         param.sched_priority = 1;
653         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
654                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
655         }
656         while (!should_quit) {
657                 int rc = libusb_handle_events(nullptr);
658                 if (rc != LIBUSB_SUCCESS)
659                         break;
660         }
661 }
662
663 struct USBCardDevice {
664         uint16_t product;
665         uint8_t bus, port;
666         libusb_device *device;
667 };
668
669 libusb_device_handle *open_card(int card_index)
670 {       
671         libusb_device **devices;
672         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
673         if (num_devices == -1) {
674                 fprintf(stderr, "Error finding USB devices\n");
675                 exit(1);
676         }
677         vector<USBCardDevice> found_cards;
678         for (ssize_t i = 0; i < num_devices; ++i) {
679                 libusb_device_descriptor desc;
680                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
681                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
682                         exit(1);
683                 }
684
685                 uint8_t bus = libusb_get_bus_number(devices[i]);
686                 uint8_t port = libusb_get_port_number(devices[i]);
687
688                 if (!(desc.idVendor == 0x1edb && desc.idProduct == 0xbd3b) &&
689                     !(desc.idVendor == 0x1edb && desc.idProduct == 0xbd4f)) {
690                         libusb_unref_device(devices[i]);
691                         continue;
692                 }
693
694                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
695         }
696         libusb_free_device_list(devices, 0);
697
698         // Sort the devices to get a consistent ordering.
699         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
700                 if (a.product != b.product)
701                         return a.product < b.product;
702                 if (a.bus != b.bus)
703                         return a.bus < b.bus;
704                 return a.port < b.port;
705         });
706
707         for (size_t i = 0; i < found_cards.size(); ++i) {
708                 fprintf(stderr, "Card %d: Bus %03u Device %03u ", int(i), found_cards[i].bus, found_cards[i].port);
709                 if (found_cards[i].product == 0xbd3b) {
710                         fprintf(stderr, "Intensity Shuttle\n");
711                 } else if (found_cards[i].product == 0xbd4f) {
712                         fprintf(stderr, "UltraStudio SDI\n");
713                 } else {
714                         assert(false);
715                 }
716         }
717
718         if (size_t(card_index) >= found_cards.size()) {
719                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
720                 exit(1);
721         }
722
723         libusb_device_handle *devh;
724         int rc = libusb_open(found_cards[card_index].device, &devh);
725         if (rc < 0) {
726                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
727                 exit(1);
728         }
729
730         for (size_t i = 0; i < found_cards.size(); ++i) {
731                 libusb_unref_device(found_cards[i].device);
732         }
733
734         return devh;
735 }
736
737 void BMUSBCapture::configure_card()
738 {
739         if (video_frame_allocator == nullptr) {
740                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));  // FIXME: leak.
741         }
742         if (audio_frame_allocator == nullptr) {
743                 set_audio_frame_allocator(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));  // FIXME: leak.
744         }
745         dequeue_thread_should_quit = false;
746         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
747
748         int rc;
749         struct libusb_transfer *xfr;
750
751         rc = libusb_init(nullptr);
752         if (rc < 0) {
753                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
754                 exit(1);
755         }
756
757         libusb_device_handle *devh = open_card(card_index);
758         if (!devh) {
759                 fprintf(stderr, "Error finding USB device\n");
760                 exit(1);
761         }
762
763         libusb_config_descriptor *config;
764         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
765         if (rc < 0) {
766                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
767                 exit(1);
768         }
769         printf("%d interface\n", config->bNumInterfaces);
770         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
771                 printf("  interface %d\n", interface_number);
772                 const libusb_interface *interface = &config->interface[interface_number];
773                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
774                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
775                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
776                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
777                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
778                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
779                         }
780                 }
781         }
782
783         rc = libusb_set_configuration(devh, /*configuration=*/1);
784         if (rc < 0) {
785                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
786                 exit(1);
787         }
788
789         rc = libusb_claim_interface(devh, 0);
790         if (rc < 0) {
791                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
792                 exit(1);
793         }
794
795         // Alternate setting 1 is output, alternate setting 2 is input.
796         // Card is reset when switching alternates, so the driver uses
797         // this “double switch” when it wants to reset.
798         //
799         // There's also alternate settings 3 and 4, which seem to be
800         // like 1 and 2 except they advertise less bandwidth needed.
801         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
802         if (rc < 0) {
803                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
804                 exit(1);
805         }
806         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
807         if (rc < 0) {
808                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
809                 exit(1);
810         }
811 #if 0
812         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
813         if (rc < 0) {
814                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
815                 exit(1);
816         }
817 #endif
818
819 #if 0
820         rc = libusb_claim_interface(devh, 3);
821         if (rc < 0) {
822                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
823                 exit(1);
824         }
825 #endif
826
827         // theories:
828         //   44 is some kind of timer register (first 16 bits count upwards)
829         //   24 is some sort of watchdog?
830         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
831         //      (or will go to 0x73c60010?), also seen 0x73c60100
832         //   12 also changes all the time, unclear why  
833         //   16 seems to be autodetected mode somehow
834         //      --    this is e00115e0 after reset?
835         //                    ed0115e0 after mode change [to output?]
836         //                    2d0015e0 after more mode change [to input]
837         //                    ed0115e0 after more mode change
838         //                    2d0015e0 after more mode change
839         //
840         //                    390115e0 seems to indicate we have signal
841         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
842         //
843         //                    200015e0 on startup
844         //         changes to 250115e0 when we sync to the signal
845         //
846         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
847         //
848         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
849         //
850         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
851         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
852         //
853         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
854         //    perhaps some of them are related to analog output?
855         //
856         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
857         //    but the driver sets it to 0x8036802a at some point.
858         //
859         //    all of this is on request 214/215. other requests (192, 219,
860         //    222, 223, 224) are used for firmware upgrade. Probably best to
861         //    stay out of it unless you know what you're doing.
862         //
863         //
864         // register 16:
865         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
866         //
867         // theories:
868         //   0x01 - stable signal
869         //   0x04 - deep color
870         //   0x08 - unknown (audio??)
871         //   0x20 - 720p??
872         //   0x30 - 576p??
873
874         struct ctrl {
875                 int endpoint;
876                 int request;
877                 int index;
878                 uint32_t data;
879         };
880         static const ctrl ctrls[] = {
881                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
882                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
883
884                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
885                 // capture (v210).
886                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
887                 // 0x10000000 = analog audio instead of embedded audio, it seems
888                 // 0x3a000000 = component video? (analog audio)
889                 // 0x3c000000 = composite video? (analog audio)
890                 // 0x3e000000 = s-video? (analog audio)
891                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
892                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
893                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
894                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
895                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
896         };
897
898         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
899                 uint32_t flipped = htonl(ctrls[req].data);
900                 static uint8_t value[4];
901                 memcpy(value, &flipped, sizeof(flipped));
902                 int size = sizeof(value);
903                 //if (ctrls[req].request == 215) size = 0;
904                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
905                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
906                 if (rc < 0) {
907                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
908                         exit(1);
909                 }
910                 
911                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
912                 for (int i = 0; i < rc; ++i) {
913                         printf("%02x", value[i]);
914                 }
915                 printf("\n");
916         }
917
918 #if 0
919         // DEBUG
920         for ( ;; ) {
921                 static int my_index = 0;
922                 static uint8_t value[4];
923                 int size = sizeof(value);
924                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
925                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
926                 if (rc < 0) {
927                         fprintf(stderr, "Error on control\n");
928                         exit(1);
929                 }
930                 printf("rc=%d index=%d: 0x", rc, my_index);
931                 for (int i = 0; i < rc; ++i) {
932                         printf("%02x", value[i]);
933                 }
934                 printf("\n");
935         }
936 #endif
937
938 #if 0
939         // set up an asynchronous transfer of the timer register
940         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
941         static int completed = 0;
942
943         xfr = libusb_alloc_transfer(0);
944         libusb_fill_control_setup(cmdbuf,
945             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
946                 /*index=*/44, /*length=*/4);
947         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
948         xfr->user_data = this;
949         libusb_submit_transfer(xfr);
950
951         // set up an asynchronous transfer of register 24
952         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
953         static int completed2 = 0;
954
955         xfr = libusb_alloc_transfer(0);
956         libusb_fill_control_setup(cmdbuf2,
957             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
958                 /*index=*/24, /*length=*/4);
959         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
960         xfr->user_data = this;
961         libusb_submit_transfer(xfr);
962 #endif
963
964         // set up an asynchronous transfer of the register dump
965         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
966         static int completed3 = 0;
967
968         xfr = libusb_alloc_transfer(0);
969         libusb_fill_control_setup(cmdbuf3,
970             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
971                 /*index=*/current_register, /*length=*/4);
972         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
973         xfr->user_data = this;
974         //libusb_submit_transfer(xfr);
975
976         audiofp = fopen("audio.raw", "wb");
977
978         // set up isochronous transfers for audio and video
979         for (int e = 3; e <= 4; ++e) {
980                 //int num_transfers = (e == 3) ? 6 : 6;
981                 int num_transfers = 10;
982                 for (int i = 0; i < num_transfers; ++i) {
983                         int num_iso_pack, size;
984                         if (e == 3) {
985                                 // Video seems to require isochronous packets scaled with the width; 
986                                 // seemingly six lines is about right, rounded up to the required 1kB
987                                 // multiple.
988                                 size = WIDTH * 2 * 6;
989                                 // Note that for 10-bit input, you'll need to increase size accordingly.
990                                 //size = size * 4 / 3;
991                                 if (size % 1024 != 0) {
992                                         size &= ~1023;
993                                         size += 1024;
994                                 }
995                                 num_iso_pack = (2 << 16) / size;  // 128 kB.
996                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
997                         } else {
998                                 size = 0xc0;
999                                 num_iso_pack = 80;
1000                         }
1001                         int num_bytes = num_iso_pack * size;
1002                         uint8_t *buf = new uint8_t[num_bytes];
1003
1004                         xfr = libusb_alloc_transfer(num_iso_pack);
1005                         if (!xfr) {
1006                                 fprintf(stderr, "oom\n");
1007                                 exit(1);
1008                         }
1009
1010                         int ep = LIBUSB_ENDPOINT_IN | e;
1011                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
1012                                 num_iso_pack, cb_xfr, nullptr, 0);
1013                         libusb_set_iso_packet_lengths(xfr, size);
1014                         xfr->user_data = this;
1015                         iso_xfrs.push_back(xfr);
1016                 }
1017         }
1018 }
1019
1020 void BMUSBCapture::start_bm_capture()
1021 {
1022         printf("starting capture\n");
1023         int i = 0;
1024         for (libusb_transfer *xfr : iso_xfrs) {
1025                 printf("submitting transfer...\n");
1026                 int rc = libusb_submit_transfer(xfr);
1027                 ++i;
1028                 if (rc < 0) {
1029                         //printf("num_bytes=%d\n", num_bytes);
1030                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1031                                 xfr->endpoint, i, libusb_error_name(rc));
1032                         exit(1);
1033                 }
1034         }
1035
1036
1037 #if 0
1038         libusb_release_interface(devh, 0);
1039 out:
1040         if (devh)
1041                 libusb_close(devh);
1042         libusb_exit(nullptr);
1043         return rc;
1044 #endif
1045 }
1046
1047 void BMUSBCapture::stop_dequeue_thread()
1048 {
1049         dequeue_thread_should_quit = true;
1050         queues_not_empty.notify_all();
1051         dequeue_thread.join();
1052 }
1053
1054 void BMUSBCapture::start_bm_thread()
1055 {
1056         should_quit = false;
1057         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1058 }
1059
1060 void BMUSBCapture::stop_bm_thread()
1061 {
1062         should_quit = true;
1063         usb_thread.join();
1064 }