]> git.sesse.net Git - bmusb/blob - bmusb.cpp
4621e4b82880ea50189ecbcb07127205683d417f
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
// Geometry of the incoming signal. HEIGHT is larger than the visible picture.
#define WIDTH 1280
#define HEIGHT 750  /* 30 lines ancillary data? */
// Alternative geometry, currently disabled:
//#define WIDTH 1920
//#define HEIGHT 1125  /* ??? lines ancillary data? */

// Number of bytes at the start of a video frame that are skipped when the
// frame is dumped or handed to the frame callback.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0
// Same, for audio blocks.
#define AUDIO_HEADER_SIZE 4

// Size of each video frame buffer. The exact per-format sizes are kept for
// reference; a flat 8 MiB is what is actually used.
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
#define FRAME_SIZE (8 * 1024 * 1024)
47
// Stream that dump_audio_block() writes to.
FILE *audiofp = nullptr;

// Thread object for the USB event loop (presumably started elsewhere -- TODO confirm).
std::thread usb_thread;
// Polled by BMUSBCapture::usb_thread_func(); set to true to make it return.
std::atomic<bool> should_quit{false};
52
53 FrameAllocator::~FrameAllocator() {}
54
// Pool depths for the default allocators. Audio matters more than video and
// is far cheaper to buffer, so it gets a much deeper pool: if we start
// falling behind, video frames should be dropped first (which also reduces
// CPU load) long before any audio has to go.
#define NUM_QUEUED_VIDEO_FRAMES 16
#define NUM_QUEUED_AUDIO_FRAMES 64
61
62 class MallocFrameAllocator : public FrameAllocator {
63 public:
64         MallocFrameAllocator(size_t frame_size, size_t num_queued_frames);
65         Frame alloc_frame() override;
66         void release_frame(Frame frame) override;
67
68 private:
69         size_t frame_size;
70
71         mutex freelist_mutex;
72         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
73 };
74
75 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
76         : frame_size(frame_size)
77 {
78         for (size_t i = 0; i < num_queued_frames; ++i) {
79                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
80         }
81 }
82
83 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
84 {
85         Frame vf;
86         vf.owner = this;
87
88         unique_lock<mutex> lock(freelist_mutex);  // Meh.
89         if (freelist.empty()) {
90                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
91                         frame_size);
92         } else {
93                 vf.data = freelist.top().release();
94                 vf.size = frame_size;
95                 freelist.pop();  // Meh.
96         }
97         return vf;
98 }
99
100 void MallocFrameAllocator::release_frame(Frame frame)
101 {
102         if (frame.overflow > 0) {
103                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
104         }
105         unique_lock<mutex> lock(freelist_mutex);
106         freelist.push(unique_ptr<uint8_t[]>(frame.data));
107 }
108
// Returns true if <a> comes strictly before <b> on the 16-bit circle, i.e. if
// stepping forward from a reaches b in fewer than 0x8000 (but more than zero)
// steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, modulo 2^16.
        const uint16_t distance = uint16_t(b - a);
        return distance != 0 && distance < 0x8000;
}
120
121 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
122 {
123         unique_lock<mutex> lock(queue_lock);
124         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
125                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
126                         q->back().timecode, timecode);
127                 frame.owner->release_frame(frame);
128                 return;
129         }
130
131         QueuedFrame qf;
132         qf.format = format;
133         qf.timecode = timecode;
134         qf.frame = frame;
135         q->push_back(move(qf));
136         queues_not_empty.notify_one();  // might be spurious
137 }
138
139 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
140 {
141         FILE *fp = fopen(filename, "wb");
142         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
143                 printf("short write!\n");
144         }
145         fclose(fp);
146 }
147
148 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
149 {
150         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
151 }
152
153 void BMUSBCapture::dequeue_thread_func()
154 {
155         if (has_dequeue_callbacks) {
156                 dequeue_init_callback();
157         }
158         while (!dequeue_thread_should_quit) {
159                 unique_lock<mutex> lock(queue_lock);
160                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
161
162                 if (dequeue_thread_should_quit) break;
163
164                 uint16_t video_timecode = pending_video_frames.front().timecode;
165                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
166                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
167                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
168                                 video_timecode);
169                         QueuedFrame video_frame = pending_video_frames.front();
170                         pending_video_frames.pop_front();
171                         lock.unlock();
172                         video_frame_allocator->release_frame(video_frame.frame);
173                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
174                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
175                                 audio_timecode);
176                         QueuedFrame audio_frame = pending_audio_frames.front();
177                         pending_audio_frames.pop_front();
178                         lock.unlock();
179                         frame_callback(audio_timecode,
180                                        FrameAllocator::Frame(), 0, 0x0000,
181                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
182                 } else {
183                         QueuedFrame video_frame = pending_video_frames.front();
184                         QueuedFrame audio_frame = pending_audio_frames.front();
185                         pending_audio_frames.pop_front();
186                         pending_video_frames.pop_front();
187                         lock.unlock();
188
189 #if 0
190                         char filename[255];
191                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
192                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
193                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
194 #endif
195
196                         frame_callback(video_timecode,
197                                        video_frame.frame, HEADER_SIZE, video_frame.format,
198                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
199                 }
200         }
201         if (has_dequeue_callbacks) {
202                 dequeue_cleanup_callback();
203         }
204 }
205
206 void BMUSBCapture::start_new_frame(const uint8_t *start)
207 {
208         uint16_t format = (start[3] << 8) | start[2];
209         uint16_t timecode = (start[1] << 8) | start[0];
210
211         if (current_video_frame.len > 0) {
212                 // If format is 0x0800 (no signal), add a fake (empty) audio
213                 // frame to get it out of the queue.
214                 // TODO: Figure out if there are other formats that come with
215                 // no audio, and treat them the same.
216                 if (format == 0x0800) {
217                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
218                         if (fake_audio_frame.data == nullptr) {
219                                 // Oh well, it's just a no-signal frame anyway.
220                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
221                                 current_video_frame.owner->release_frame(current_video_frame);
222                                 current_video_frame = video_frame_allocator->alloc_frame();
223                                 return;
224                         }
225                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
226                 }
227                 //dump_frame();
228                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
229         }
230         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
231         //      format, timecode,
232         //      //start[7], start[6], start[5], start[4],
233         //      read_current_frame, FRAME_SIZE);
234
235         current_video_frame = video_frame_allocator->alloc_frame();
236         //if (current_video_frame.data == nullptr) {
237         //      read_current_frame = -1;
238         //} else {
239         //      read_current_frame = 0;
240         //}
241 }
242
243 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
244 {
245         uint16_t format = (start[3] << 8) | start[2];
246         uint16_t timecode = (start[1] << 8) | start[0];
247         if (current_audio_frame.len > 0) {
248                 //dump_audio_block();
249                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
250         }
251         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
252         //      format, timecode, read_current_audio_block);
253         current_audio_frame = audio_frame_allocator->alloc_frame();
254 }
255
256 #if 0
257 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
258 {
259         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
260         for (unsigned j = 0; j < pack->actual_length; j++) {
261         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
262                 printf("%02x", xfr->buffer[j + offset]);
263                 if ((j % 16) == 15)
264                         printf("\n");
265                 else if ((j % 8) == 7)
266                         printf("  ");
267                 else
268                         printf(" ");
269         }
270 }
271 #endif
272
// De-interleaves <n> bytes from <src>: bytes at even offsets go to <dest1>,
// bytes at odd offsets go to <dest2>, n/2 bytes each. <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
284
285 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
286 {
287         if (current_frame->data == nullptr ||
288             current_frame->len > current_frame->size ||
289             start == end) {
290                 return;
291         }
292
293         int bytes = end - start;
294         if (current_frame->len + bytes > current_frame->size) {
295                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
296                 current_frame->len = current_frame->size;
297                 if (current_frame->overflow > 1048576) {
298                         printf("%d bytes overflow after last %s frame\n",
299                                 int(current_frame->overflow), frame_type_name);
300                         current_frame->overflow = 0;
301                 }
302                 //dump_frame();
303         } else {
304                 if (current_frame->interleaved) {
305                         uint8_t *data = current_frame->data + current_frame->len / 2;
306                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
307                         if (current_frame->len % 2 == 1) {
308                                 ++data;
309                                 swap(data, data2);
310                         }
311                         if (bytes % 2 == 1) {
312                                 *data++ = *start++;
313                                 swap(data, data2);
314                                 ++current_frame->len;
315                                 --bytes;
316                         }
317                         memcpy_interleaved(data, data2, start, bytes);
318                         current_frame->len += bytes;
319                 } else {
320                         memcpy(current_frame->data + current_frame->len, start, bytes);
321                         current_frame->len += bytes;
322                 }
323         }
324 }
325
326 #ifdef __SSE4_1__
327
328 #if 0
// Debugging aid: prints the 32 bytes of <n> in hex, lowest byte first, in
// groups of eight, prefixed with <name>.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i > 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
369 #endif
370
371 // Does a memcpy and memchr in one to reduce processing time.
372 // Note that the benefit is somewhat limited if your L3 cache is small,
373 // as you'll (unfortunately) spend most of the time loading the data
374 // from main memory.
375 //
376 // Complicated cases are left to the slow path; it basically stops copying
377 // up until the first instance of "sync_char" (usually a bit before, actually).
378 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
379 // data, and what we really need this for is the 00 00 ff ff marker in video data.
380 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
381 {
382         if (current_frame->data == nullptr ||
383             current_frame->len > current_frame->size ||
384             start == limit) {
385                 return start;
386         }
387         size_t orig_bytes = limit - start;
388         if (orig_bytes < 128) {
389                 // Don't bother.
390                 return start;
391         }
392
393         // Don't read more bytes than we can write.
394         limit = min(limit, start + (current_frame->size - current_frame->len));
395
396         // Align end to 32 bytes.
397         limit = (const uint8_t *)(intptr_t(limit) & ~31);
398
399         if (start >= limit) {
400                 return start;
401         }
402
403         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
404         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
405         if (aligned_start != start) {
406                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
407                 if (sync_start == nullptr) {
408                         add_to_frame(current_frame, "", start, aligned_start);
409                 } else {
410                         add_to_frame(current_frame, "", start, sync_start);
411                         return sync_start;
412                 }
413         }
414
415         // Make the length a multiple of 64.
416         if (current_frame->interleaved) {
417                 if (((limit - aligned_start) % 64) != 0) {
418                         limit -= 32;
419                 }
420                 assert(((limit - aligned_start) % 64) == 0);
421         }
422
423 #if __AVX2__
424         const __m256i needle = _mm256_set1_epi8(sync_char);
425
426         const __restrict __m256i *in = (const __m256i *)aligned_start;
427         if (current_frame->interleaved) {
428                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
429                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
430                 if (current_frame->len % 2 == 1) {
431                         swap(out1, out2);
432                 }
433
434                 __m256i shuffle_cw = _mm256_set_epi8(
435                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
436                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
437                 while (in < (const __m256i *)limit) {
438                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
439                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
440                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
441
442                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
443                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
444                         __m256i found = _mm256_or_si256(found1, found2);
445
446                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
447                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
448                 
449                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
450                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
451
452                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
453                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
454
455                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
456                         _mm256_storeu_si256(out2, hi);
457
458                         if (!_mm256_testz_si256(found, found)) {
459                                 break;
460                         }
461
462                         in += 2;
463                         ++out1;
464                         ++out2;
465                 }
466                 current_frame->len += (uint8_t *)in - aligned_start;
467         } else {
468                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
469                 while (in < (const __m256i *)limit) {
470                         __m256i data = _mm256_load_si256(in);
471                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
472                         __m256i found = _mm256_cmpeq_epi8(data, needle);
473                         if (!_mm256_testz_si256(found, found)) {
474                                 break;
475                         }
476
477                         ++in;
478                         ++out;
479                 }
480                 current_frame->len = (uint8_t *)out - current_frame->data;
481         }
482 #else
483         const __m128i needle = _mm_set1_epi8(sync_char);
484
485         const __m128i *in = (const __m128i *)aligned_start;
486         if (current_frame->interleaved) {
487                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
488                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
489                 if (current_frame->len % 2 == 1) {
490                         swap(out1, out2);
491                 }
492
493                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
494                 while (in < (const __m128i *)limit) {
495                         __m128i data1 = _mm_load_si128(in);
496                         __m128i data2 = _mm_load_si128(in + 1);
497                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
498                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
499                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
500                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
501                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
502                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
503                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
504                         _mm_storeu_si128(out2, hi);
505                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
506                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
507                         if (!_mm_testz_si128(found1, found1) ||
508                             !_mm_testz_si128(found2, found2)) {
509                                 break;
510                         }
511
512                         in += 2;
513                         ++out1;
514                         ++out2;
515                 }
516                 current_frame->len += (uint8_t *)in - aligned_start;
517         } else {
518                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
519                 while (in < (const __m128i *)limit) {
520                         __m128i data = _mm_load_si128(in);
521                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
522                         __m128i found = _mm_cmpeq_epi8(data, needle);
523                         if (!_mm_testz_si128(found, found)) {
524                                 break;
525                         }
526
527                         ++in;
528                         ++out;
529                 }
530                 current_frame->len = (uint8_t *)out - current_frame->data;
531         }
532 #endif
533
534         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
535
536         return (const uint8_t *)in;
537 }
538 #endif
539
540 void decode_packs(const libusb_transfer *xfr,
541                   const char *sync_pattern,
542                   int sync_length,
543                   FrameAllocator::Frame *current_frame,
544                   const char *frame_type_name,
545                   function<void(const uint8_t *start)> start_callback)
546 {
547         int offset = 0;
548         for (int i = 0; i < xfr->num_iso_packets; i++) {
549                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
550
551                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
552                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
553                         continue;
554 //exit(5);
555                 }
556
557                 const uint8_t *start = xfr->buffer + offset;
558                 const uint8_t *limit = start + pack->actual_length;
559                 while (start < limit) {  // Usually runs only one iteration.
560 #ifdef __SSE4_1__
561                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
562                         if (start == limit) break;
563                         assert(start < limit);
564 #endif
565
566                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
567                         if (start_next_frame == nullptr) {
568                                 // add the rest of the buffer
569                                 add_to_frame(current_frame, frame_type_name, start, limit);
570                                 break;
571                         } else {
572                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
573                                 start = start_next_frame + sync_length;  // skip sync
574                                 start_callback(start);
575                         }
576                 }
577 #if 0
578                 dump_pack(xfr, offset, pack);
579 #endif
580                 offset += pack->length;
581         }
582 }
583
584 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
585 {
586         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
587                 fprintf(stderr, "transfer status %d\n", xfr->status);
588                 libusb_free_transfer(xfr);
589                 exit(3);
590         }
591
592         assert(xfr->user_data != nullptr);
593         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
594
595         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
596                 if (xfr->endpoint == 0x84) {
597                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
598                 } else {
599                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
600                 }
601         }
602         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
603                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
604                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
605 #if 0
606                 if (setup->wIndex == 44) {
607                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
608                 } else {
609                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
610                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
611                 }
612 #else
613                 memcpy(usb->register_file + usb->current_register, buf, 4);
614                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
615                 if (usb->current_register == 0) {
616                         // read through all of them
617                         printf("register dump:");
618                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
619                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
620                         }
621                         printf("\n");
622                 }
623                 libusb_fill_control_setup(xfr->buffer,
624                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
625                         /*index=*/usb->current_register, /*length=*/4);
626 #endif
627         }
628
629 #if 0
630         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
631         for (i = 0; i < xfr->actual_length; i++) {
632                 printf("%02x", xfr->buffer[i]);
633                 if (i % 16)
634                         printf("\n");
635                 else if (i % 8)
636                         printf("  ");
637                 else
638                         printf(" ");
639         }
640 #endif
641
642         int rc = libusb_submit_transfer(xfr);
643         if (rc < 0) {
644                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
645                 exit(1);
646         }
647 }
648
649 void BMUSBCapture::usb_thread_func()
650 {
651         sched_param param;
652         memset(&param, 0, sizeof(param));
653         param.sched_priority = 1;
654         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
655                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
656         }
657         while (!should_quit) {
658                 int rc = libusb_handle_events(nullptr);
659                 if (rc != LIBUSB_SUCCESS)
660                         break;
661         }
662 }
663
664 struct USBCardDevice {
665         uint16_t product;
666         uint8_t bus, port;
667         libusb_device *device;
668 };
669
670 libusb_device_handle *open_card(int card_index)
671 {       
672         libusb_device **devices;
673         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
674         if (num_devices == -1) {
675                 fprintf(stderr, "Error finding USB devices\n");
676                 exit(1);
677         }
678         vector<USBCardDevice> found_cards;
679         for (ssize_t i = 0; i < num_devices; ++i) {
680                 libusb_device_descriptor desc;
681                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
682                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
683                         exit(1);
684                 }
685
686                 uint8_t bus = libusb_get_bus_number(devices[i]);
687                 uint8_t port = libusb_get_port_number(devices[i]);
688
689                 if (!(desc.idVendor == 0x1edb && desc.idProduct == 0xbd3b) &&
690                     !(desc.idVendor == 0x1edb && desc.idProduct == 0xbd4f)) {
691                         libusb_unref_device(devices[i]);
692                         continue;
693                 }
694
695                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
696         }
697         libusb_free_device_list(devices, 0);
698
699         // Sort the devices to get a consistent ordering.
700         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
701                 if (a.product != b.product)
702                         return a.product < b.product;
703                 if (a.bus != b.bus)
704                         return a.bus < b.bus;
705                 return a.port < b.port;
706         });
707
708         for (size_t i = 0; i < found_cards.size(); ++i) {
709                 fprintf(stderr, "Card %d: Bus %03u Device %03u ", int(i), found_cards[i].bus, found_cards[i].port);
710                 if (found_cards[i].product == 0xbd3b) {
711                         fprintf(stderr, "Intensity Shuttle\n");
712                 } else if (found_cards[i].product == 0xbd4f) {
713                         fprintf(stderr, "UltraStudio SDI\n");
714                 } else {
715                         assert(false);
716                 }
717         }
718
719         if (size_t(card_index) >= found_cards.size()) {
720                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
721                 exit(1);
722         }
723
724         libusb_device_handle *devh;
725         int rc = libusb_open(found_cards[card_index].device, &devh);
726         if (rc < 0) {
727                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
728                 exit(1);
729         }
730
731         for (size_t i = 0; i < found_cards.size(); ++i) {
732                 libusb_unref_device(found_cards[i].device);
733         }
734
735         return devh;
736 }
737
738 void BMUSBCapture::configure_card()
739 {
740         if (video_frame_allocator == nullptr) {
741                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));  // FIXME: leak.
742         }
743         if (audio_frame_allocator == nullptr) {
744                 set_audio_frame_allocator(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));  // FIXME: leak.
745         }
746         dequeue_thread_should_quit = false;
747         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
748
749         int rc;
750         struct libusb_transfer *xfr;
751
752         rc = libusb_init(nullptr);
753         if (rc < 0) {
754                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
755                 exit(1);
756         }
757
758         libusb_device_handle *devh = open_card(card_index);
759         if (!devh) {
760                 fprintf(stderr, "Error finding USB device\n");
761                 exit(1);
762         }
763
764         libusb_config_descriptor *config;
765         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
766         if (rc < 0) {
767                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
768                 exit(1);
769         }
770         printf("%d interface\n", config->bNumInterfaces);
771         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
772                 printf("  interface %d\n", interface_number);
773                 const libusb_interface *interface = &config->interface[interface_number];
774                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
775                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
776                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
777                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
778                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
779                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
780                         }
781                 }
782         }
783
784         rc = libusb_set_configuration(devh, /*configuration=*/1);
785         if (rc < 0) {
786                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
787                 exit(1);
788         }
789
790         rc = libusb_claim_interface(devh, 0);
791         if (rc < 0) {
792                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
793                 exit(1);
794         }
795
796         // Alternate setting 1 is output, alternate setting 2 is input.
797         // Card is reset when switching alternates, so the driver uses
798         // this “double switch” when it wants to reset.
799         //
800         // There's also alternate settings 3 and 4, which seem to be
801         // like 1 and 2 except they advertise less bandwidth needed.
802         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
803         if (rc < 0) {
804                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
805                 exit(1);
806         }
807         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
808         if (rc < 0) {
809                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
810                 exit(1);
811         }
812 #if 0
813         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
814         if (rc < 0) {
815                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
816                 exit(1);
817         }
818 #endif
819
820 #if 0
821         rc = libusb_claim_interface(devh, 3);
822         if (rc < 0) {
823                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
824                 exit(1);
825         }
826 #endif
827
828         // theories:
829         //   44 is some kind of timer register (first 16 bits count upwards)
830         //   24 is some sort of watchdog?
831         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
832         //      (or will go to 0x73c60010?), also seen 0x73c60100
833         //   12 also changes all the time, unclear why  
834         //   16 seems to be autodetected mode somehow
835         //      --    this is e00115e0 after reset?
836         //                    ed0115e0 after mode change [to output?]
837         //                    2d0015e0 after more mode change [to input]
838         //                    ed0115e0 after more mode change
839         //                    2d0015e0 after more mode change
840         //
841         //                    390115e0 seems to indicate we have signal
842         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
843         //
844         //                    200015e0 on startup
845         //         changes to 250115e0 when we sync to the signal
846         //
847         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
848         //
849         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
850         //
851         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
852         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
853         //
854         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
855         //    perhaps some of them are related to analog output?
856         //
857         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
858         //    but the driver sets it to 0x8036802a at some point.
859         //
860         //    all of this is on request 214/215. other requests (192, 219,
861         //    222, 223, 224) are used for firmware upgrade. Probably best to
862         //    stay out of it unless you know what you're doing.
863         //
864         //
865         // register 16:
866         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
867         //
868         // theories:
869         //   0x01 - stable signal
870         //   0x04 - deep color
871         //   0x08 - unknown (audio??)
872         //   0x20 - 720p??
873         //   0x30 - 576p??
874
875         struct ctrl {
876                 int endpoint;
877                 int request;
878                 int index;
879                 uint32_t data;
880         };
881         static const ctrl ctrls[] = {
882                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
883                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
884
885                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
886                 // capture (v210).
887                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
888                 // 0x10000000 = analog audio instead of embedded audio, it seems
889                 // 0x3a000000 = component video? (analog audio)
890                 // 0x3c000000 = composite video? (analog audio)
891                 // 0x3e000000 = s-video? (analog audio)
892                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
893                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
894                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
895                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
896                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
897         };
898
899         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
900                 uint32_t flipped = htonl(ctrls[req].data);
901                 static uint8_t value[4];
902                 memcpy(value, &flipped, sizeof(flipped));
903                 int size = sizeof(value);
904                 //if (ctrls[req].request == 215) size = 0;
905                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
906                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
907                 if (rc < 0) {
908                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
909                         exit(1);
910                 }
911                 
912                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
913                 for (int i = 0; i < rc; ++i) {
914                         printf("%02x", value[i]);
915                 }
916                 printf("\n");
917         }
918
919 #if 0
920         // DEBUG
921         for ( ;; ) {
922                 static int my_index = 0;
923                 static uint8_t value[4];
924                 int size = sizeof(value);
925                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
926                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
927                 if (rc < 0) {
928                         fprintf(stderr, "Error on control\n");
929                         exit(1);
930                 }
931                 printf("rc=%d index=%d: 0x", rc, my_index);
932                 for (int i = 0; i < rc; ++i) {
933                         printf("%02x", value[i]);
934                 }
935                 printf("\n");
936         }
937 #endif
938
939 #if 0
940         // set up an asynchronous transfer of the timer register
941         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
942         static int completed = 0;
943
944         xfr = libusb_alloc_transfer(0);
945         libusb_fill_control_setup(cmdbuf,
946             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
947                 /*index=*/44, /*length=*/4);
948         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
949         xfr->user_data = this;
950         libusb_submit_transfer(xfr);
951
952         // set up an asynchronous transfer of register 24
953         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
954         static int completed2 = 0;
955
956         xfr = libusb_alloc_transfer(0);
957         libusb_fill_control_setup(cmdbuf2,
958             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
959                 /*index=*/24, /*length=*/4);
960         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
961         xfr->user_data = this;
962         libusb_submit_transfer(xfr);
963 #endif
964
965         // set up an asynchronous transfer of the register dump
966         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
967         static int completed3 = 0;
968
969         xfr = libusb_alloc_transfer(0);
970         libusb_fill_control_setup(cmdbuf3,
971             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
972                 /*index=*/current_register, /*length=*/4);
973         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
974         xfr->user_data = this;
975         //libusb_submit_transfer(xfr);
976
977         audiofp = fopen("audio.raw", "wb");
978
979         // set up isochronous transfers for audio and video
980         for (int e = 3; e <= 4; ++e) {
981                 //int num_transfers = (e == 3) ? 6 : 6;
982                 int num_transfers = 10;
983                 for (int i = 0; i < num_transfers; ++i) {
984                         int num_iso_pack, size;
985                         if (e == 3) {
986                                 // Video seems to require isochronous packets scaled with the width; 
987                                 // seemingly six lines is about right, rounded up to the required 1kB
988                                 // multiple.
989                                 size = WIDTH * 2 * 6;
990                                 // Note that for 10-bit input, you'll need to increase size accordingly.
991                                 //size = size * 4 / 3;
992                                 if (size % 1024 != 0) {
993                                         size &= ~1023;
994                                         size += 1024;
995                                 }
996                                 num_iso_pack = (2 << 16) / size;  // 128 kB.
997                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
998                         } else {
999                                 size = 0xc0;
1000                                 num_iso_pack = 80;
1001                         }
1002                         int num_bytes = num_iso_pack * size;
1003                         uint8_t *buf = new uint8_t[num_bytes];
1004
1005                         xfr = libusb_alloc_transfer(num_iso_pack);
1006                         if (!xfr) {
1007                                 fprintf(stderr, "oom\n");
1008                                 exit(1);
1009                         }
1010
1011                         int ep = LIBUSB_ENDPOINT_IN | e;
1012                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
1013                                 num_iso_pack, cb_xfr, nullptr, 0);
1014                         libusb_set_iso_packet_lengths(xfr, size);
1015                         xfr->user_data = this;
1016                         iso_xfrs.push_back(xfr);
1017                 }
1018         }
1019 }
1020
1021 void BMUSBCapture::start_bm_capture()
1022 {
1023         printf("starting capture\n");
1024         int i = 0;
1025         for (libusb_transfer *xfr : iso_xfrs) {
1026                 printf("submitting transfer...\n");
1027                 int rc = libusb_submit_transfer(xfr);
1028                 ++i;
1029                 if (rc < 0) {
1030                         //printf("num_bytes=%d\n", num_bytes);
1031                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1032                                 xfr->endpoint, i, libusb_error_name(rc));
1033                         exit(1);
1034                 }
1035         }
1036
1037
1038 #if 0
1039         libusb_release_interface(devh, 0);
1040 out:
1041         if (devh)
1042                 libusb_close(devh);
1043         libusb_exit(nullptr);
1044         return rc;
1045 #endif
1046 }
1047
1048 void BMUSBCapture::stop_dequeue_thread()
1049 {
1050         dequeue_thread_should_quit = true;
1051         queues_not_empty.notify_all();
1052         dequeue_thread.join();
1053 }
1054
1055 void BMUSBCapture::start_bm_thread()
1056 {
1057         should_quit = false;
1058         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1059 }
1060
1061 void BMUSBCapture::stop_bm_thread()
1062 {
1063         should_quit = true;
1064         usb_thread.join();
1065 }