]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Fix a crash during shutdown.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
36 #define WIDTH 1280
37 #define HEIGHT 750  /* 30 lines ancillary data? */
38 //#define WIDTH 1920
39 //#define HEIGHT 1125  /* ??? lines ancillary data? */
40 #define HEADER_SIZE 44
41 //#define HEADER_SIZE 0
42 #define AUDIO_HEADER_SIZE 4
43
44 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
45 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
46 #define FRAME_SIZE (8 << 20)
47
// Destination for dump_audio_block(). Nothing in this part of the file opens it.
FILE *audiofp = nullptr;

// Thread object for the USB event loop (presumably runs usb_thread_func(); confirm
// against the code that starts it), and the flag that loop polls to know when to stop.
thread usb_thread;
atomic<bool> should_quit{false};
52
53 FrameAllocator::~FrameAllocator() {}
54
55 // Audio is more important than video, and also much cheaper.
56 // By having many more audio frames available, hopefully if something
57 // starts to drop, we'll have CPU load go down (from not having to
58 // process as much video) before we have to drop audio.
59 #define NUM_QUEUED_VIDEO_FRAMES 16
60 #define NUM_QUEUED_AUDIO_FRAMES 64
61
62 class MallocFrameAllocator : public FrameAllocator {
63 public:
64         MallocFrameAllocator(size_t frame_size, size_t num_queued_frames);
65         Frame alloc_frame() override;
66         void release_frame(Frame frame) override;
67
68 private:
69         size_t frame_size;
70
71         mutex freelist_mutex;
72         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
73 };
74
75 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
76         : frame_size(frame_size)
77 {
78         for (size_t i = 0; i < num_queued_frames; ++i) {
79                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
80         }
81 }
82
83 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
84 {
85         Frame vf;
86         vf.owner = this;
87
88         unique_lock<mutex> lock(freelist_mutex);  // Meh.
89         if (freelist.empty()) {
90                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
91                         frame_size);
92         } else {
93                 vf.data = freelist.top().release();
94                 vf.size = frame_size;
95                 freelist.pop();  // Meh.
96         }
97         return vf;
98 }
99
100 void MallocFrameAllocator::release_frame(Frame frame)
101 {
102         if (frame.overflow > 0) {
103                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
104         }
105         unique_lock<mutex> lock(freelist_mutex);
106         freelist.push(unique_ptr<uint8_t[]>(frame.data));
107 }
108
// Returns true if <b> comes after <a> when 16-bit counters are allowed to wrap:
// that is, if stepping forward from <a> reaches <b> in at least one and fewer
// than 0x8000 steps (modulo 0x10000). Equal values are never "less than".
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
120
121 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
122 {
123         unique_lock<mutex> lock(queue_lock);
124         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
125                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
126                         q->back().timecode, timecode);
127                 frame.owner->release_frame(frame);
128                 return;
129         }
130
131         QueuedFrame qf;
132         qf.format = format;
133         qf.timecode = timecode;
134         qf.frame = frame;
135         q->push_back(move(qf));
136         queues_not_empty.notify_one();  // might be spurious
137 }
138
139 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
140 {
141         FILE *fp = fopen(filename, "wb");
142         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
143                 printf("short write!\n");
144         }
145         fclose(fp);
146 }
147
148 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
149 {
150         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
151 }
152
153 void BMUSBCapture::dequeue_thread_func()
154 {
155         if (has_dequeue_callbacks) {
156                 dequeue_init_callback();
157         }
158         while (!dequeue_thread_should_quit) {
159                 unique_lock<mutex> lock(queue_lock);
160                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
161
162                 if (dequeue_thread_should_quit) break;
163
164                 uint16_t video_timecode = pending_video_frames.front().timecode;
165                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
166                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
167                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
168                                 video_timecode);
169                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
170                         pending_video_frames.pop_front();
171                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
172                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
173                                 audio_timecode);
174                         QueuedFrame audio_frame = pending_audio_frames.front();
175                         pending_audio_frames.pop_front();
176                         lock.unlock();
177                         frame_callback(audio_timecode,
178                                        FrameAllocator::Frame(), 0, 0x0000,
179                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
180                 } else {
181                         QueuedFrame video_frame = pending_video_frames.front();
182                         QueuedFrame audio_frame = pending_audio_frames.front();
183                         pending_audio_frames.pop_front();
184                         pending_video_frames.pop_front();
185                         lock.unlock();
186
187 #if 0
188                         char filename[255];
189                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
190                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
191                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
192 #endif
193
194                         frame_callback(video_timecode,
195                                        video_frame.frame, HEADER_SIZE, video_frame.format,
196                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
197                 }
198         }
199         if (has_dequeue_callbacks) {
200                 dequeue_cleanup_callback();
201         }
202 }
203
204 void BMUSBCapture::start_new_frame(const uint8_t *start)
205 {
206         uint16_t format = (start[3] << 8) | start[2];
207         uint16_t timecode = (start[1] << 8) | start[0];
208
209         if (current_video_frame.len > 0) {
210                 // If format is 0x0800 (no signal), add a fake (empty) audio
211                 // frame to get it out of the queue.
212                 // TODO: Figure out if there are other formats that come with
213                 // no audio, and treat them the same.
214                 if (format == 0x0800) {
215                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
216                         if (fake_audio_frame.data == nullptr) {
217                                 // Oh well, it's just a no-signal frame anyway.
218                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
219                                 current_video_frame.owner->release_frame(current_video_frame);
220                                 current_video_frame = video_frame_allocator->alloc_frame();
221                                 return;
222                         }
223                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
224                 }
225                 //dump_frame();
226                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
227         }
228         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
229         //      format, timecode,
230         //      //start[7], start[6], start[5], start[4],
231         //      read_current_frame, FRAME_SIZE);
232
233         current_video_frame = video_frame_allocator->alloc_frame();
234         //if (current_video_frame.data == nullptr) {
235         //      read_current_frame = -1;
236         //} else {
237         //      read_current_frame = 0;
238         //}
239 }
240
241 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
242 {
243         uint16_t format = (start[3] << 8) | start[2];
244         uint16_t timecode = (start[1] << 8) | start[0];
245         if (current_audio_frame.len > 0) {
246                 //dump_audio_block();
247                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
248         }
249         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
250         //      format, timecode, read_current_audio_block);
251         current_audio_frame = audio_frame_allocator->alloc_frame();
252 }
253
#if 0
// Debugging aid (compiled out): hex-dumps the bytes actually received in one
// isochronous packet, 16 per line with a wider gap after the eighth.
// <offset> is the packet's position within the transfer buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", xfr->buffer[pos + offset]);

                const unsigned column = pos % 16;
                if (column == 15) {
                        printf("\n");
                } else if (column == 7) {
                        printf("  ");
                } else {
                        printf(" ");
                }
        }
}
#endif
270
// De-interleaves <n> bytes from <src>: bytes at even positions go to <dest1>,
// bytes at odd positions go to <dest2>, each written contiguously.
// <n> must be even (checked by assert).
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        const size_t num_pairs = n / 2 + n % 2;
        for (size_t pair = 0; pair < num_pairs; ++pair) {
                dest1[pair] = src[2 * pair];
                dest2[pair] = src[2 * pair + 1];
        }
}
282
283 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
284 {
285         if (current_frame->data == nullptr ||
286             current_frame->len > current_frame->size ||
287             start == end) {
288                 return;
289         }
290
291         int bytes = end - start;
292         if (current_frame->len + bytes > current_frame->size) {
293                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
294                 current_frame->len = current_frame->size;
295                 if (current_frame->overflow > 1048576) {
296                         printf("%d bytes overflow after last %s frame\n",
297                                 int(current_frame->overflow), frame_type_name);
298                         current_frame->overflow = 0;
299                 }
300                 //dump_frame();
301         } else {
302                 if (current_frame->interleaved) {
303                         uint8_t *data = current_frame->data + current_frame->len / 2;
304                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
305                         if (current_frame->len % 2 == 1) {
306                                 ++data;
307                                 swap(data, data2);
308                         }
309                         if (bytes % 2 == 1) {
310                                 *data++ = *start++;
311                                 swap(data, data2);
312                                 ++current_frame->len;
313                                 --bytes;
314                         }
315                         memcpy_interleaved(data, data2, start, bytes);
316                         current_frame->len += bytes;
317                 } else {
318                         memcpy(current_frame->data + current_frame->len, start, bytes);
319                         current_frame->len += bytes;
320                 }
321         }
322 }
323
324 #ifdef __SSE4_1__
325
#if 0
// Debugging aid (compiled out): prints <name> followed by the 32 bytes of <n>
// in hex, lowest byte first, with an extra space after every group of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                printf(" %02x", bytes[i]);
                if (i % 8 == 7) {
                        printf(i == 31 ? "\n" : " ");
                }
        }
}
#endif
368
369 // Does a memcpy and memchr in one to reduce processing time.
370 // Note that the benefit is somewhat limited if your L3 cache is small,
371 // as you'll (unfortunately) spend most of the time loading the data
372 // from main memory.
373 //
374 // Complicated cases are left to the slow path; it basically stops copying
375 // up until the first instance of "sync_char" (usually a bit before, actually).
376 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
377 // data, and what we really need this for is the 00 00 ff ff marker in video data.
378 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
379 {
380         if (current_frame->data == nullptr ||
381             current_frame->len > current_frame->size ||
382             start == limit) {
383                 return start;
384         }
385         size_t orig_bytes = limit - start;
386         if (orig_bytes < 128) {
387                 // Don't bother.
388                 return start;
389         }
390
391         // Don't read more bytes than we can write.
392         limit = min(limit, start + (current_frame->size - current_frame->len));
393
394         // Align end to 32 bytes.
395         limit = (const uint8_t *)(intptr_t(limit) & ~31);
396
397         if (start >= limit) {
398                 return start;
399         }
400
401         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
402         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
403         if (aligned_start != start) {
404                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
405                 if (sync_start == nullptr) {
406                         add_to_frame(current_frame, "", start, aligned_start);
407                 } else {
408                         add_to_frame(current_frame, "", start, sync_start);
409                         return sync_start;
410                 }
411         }
412
413         // Make the length a multiple of 64.
414         if (current_frame->interleaved) {
415                 if (((limit - aligned_start) % 64) != 0) {
416                         limit -= 32;
417                 }
418                 assert(((limit - aligned_start) % 64) == 0);
419         }
420
421 #if __AVX2__
422         const __m256i needle = _mm256_set1_epi8(sync_char);
423
424         const __restrict __m256i *in = (const __m256i *)aligned_start;
425         if (current_frame->interleaved) {
426                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
427                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
428                 if (current_frame->len % 2 == 1) {
429                         swap(out1, out2);
430                 }
431
432                 __m256i shuffle_cw = _mm256_set_epi8(
433                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
434                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
435                 while (in < (const __m256i *)limit) {
436                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
437                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
438                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
439
440                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
441                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
442                         __m256i found = _mm256_or_si256(found1, found2);
443
444                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
445                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
446                 
447                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
448                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
449
450                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
451                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
452
453                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
454                         _mm256_storeu_si256(out2, hi);
455
456                         if (!_mm256_testz_si256(found, found)) {
457                                 break;
458                         }
459
460                         in += 2;
461                         ++out1;
462                         ++out2;
463                 }
464                 current_frame->len += (uint8_t *)in - aligned_start;
465         } else {
466                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
467                 while (in < (const __m256i *)limit) {
468                         __m256i data = _mm256_load_si256(in);
469                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
470                         __m256i found = _mm256_cmpeq_epi8(data, needle);
471                         if (!_mm256_testz_si256(found, found)) {
472                                 break;
473                         }
474
475                         ++in;
476                         ++out;
477                 }
478                 current_frame->len = (uint8_t *)out - current_frame->data;
479         }
480 #else
481         const __m128i needle = _mm_set1_epi8(sync_char);
482
483         const __m128i *in = (const __m128i *)aligned_start;
484         if (current_frame->interleaved) {
485                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
486                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
487                 if (current_frame->len % 2 == 1) {
488                         swap(out1, out2);
489                 }
490
491                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
492                 while (in < (const __m128i *)limit) {
493                         __m128i data1 = _mm_load_si128(in);
494                         __m128i data2 = _mm_load_si128(in + 1);
495                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
496                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
497                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
498                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
499                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
500                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
501                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
502                         _mm_storeu_si128(out2, hi);
503                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
504                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
505                         if (!_mm_testz_si128(found1, found1) ||
506                             !_mm_testz_si128(found2, found2)) {
507                                 break;
508                         }
509
510                         in += 2;
511                         ++out1;
512                         ++out2;
513                 }
514                 current_frame->len += (uint8_t *)in - aligned_start;
515         } else {
516                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
517                 while (in < (const __m128i *)limit) {
518                         __m128i data = _mm_load_si128(in);
519                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
520                         __m128i found = _mm_cmpeq_epi8(data, needle);
521                         if (!_mm_testz_si128(found, found)) {
522                                 break;
523                         }
524
525                         ++in;
526                         ++out;
527                 }
528                 current_frame->len = (uint8_t *)out - current_frame->data;
529         }
530 #endif
531
532         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
533
534         return (const uint8_t *)in;
535 }
536 #endif
537
538 void decode_packs(const libusb_transfer *xfr,
539                   const char *sync_pattern,
540                   int sync_length,
541                   FrameAllocator::Frame *current_frame,
542                   const char *frame_type_name,
543                   function<void(const uint8_t *start)> start_callback)
544 {
545         int offset = 0;
546         for (int i = 0; i < xfr->num_iso_packets; i++) {
547                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
548
549                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
550                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
551                         continue;
552 //exit(5);
553                 }
554
555                 const uint8_t *start = xfr->buffer + offset;
556                 const uint8_t *limit = start + pack->actual_length;
557                 while (start < limit) {  // Usually runs only one iteration.
558 #ifdef __SSE4_1__
559                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
560                         if (start == limit) break;
561                         assert(start < limit);
562 #endif
563
564                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
565                         if (start_next_frame == nullptr) {
566                                 // add the rest of the buffer
567                                 add_to_frame(current_frame, frame_type_name, start, limit);
568                                 break;
569                         } else {
570                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
571                                 start = start_next_frame + sync_length;  // skip sync
572                                 start_callback(start);
573                         }
574                 }
575 #if 0
576                 dump_pack(xfr, offset, pack);
577 #endif
578                 offset += pack->length;
579         }
580 }
581
582 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
583 {
584         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
585                 fprintf(stderr, "transfer status %d\n", xfr->status);
586                 libusb_free_transfer(xfr);
587                 exit(3);
588         }
589
590         assert(xfr->user_data != nullptr);
591         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
592
593         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
594                 if (xfr->endpoint == 0x84) {
595                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
596                 } else {
597                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
598                 }
599         }
600         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
601                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
602                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
603 #if 0
604                 if (setup->wIndex == 44) {
605                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
606                 } else {
607                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
608                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
609                 }
610 #else
611                 memcpy(usb->register_file + usb->current_register, buf, 4);
612                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
613                 if (usb->current_register == 0) {
614                         // read through all of them
615                         printf("register dump:");
616                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
617                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
618                         }
619                         printf("\n");
620                 }
621                 libusb_fill_control_setup(xfr->buffer,
622                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
623                         /*index=*/usb->current_register, /*length=*/4);
624 #endif
625         }
626
627 #if 0
628         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
629         for (i = 0; i < xfr->actual_length; i++) {
630                 printf("%02x", xfr->buffer[i]);
631                 if (i % 16)
632                         printf("\n");
633                 else if (i % 8)
634                         printf("  ");
635                 else
636                         printf(" ");
637         }
638 #endif
639
640         int rc = libusb_submit_transfer(xfr);
641         if (rc < 0) {
642                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
643                 exit(1);
644         }
645 }
646
647 void BMUSBCapture::usb_thread_func()
648 {
649         sched_param param;
650         memset(&param, 0, sizeof(param));
651         param.sched_priority = 1;
652         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
653                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
654         }
655         while (!should_quit) {
656                 int rc = libusb_handle_events(nullptr);
657                 if (rc != LIBUSB_SUCCESS)
658                         break;
659         }
660 }
661
662 struct USBCardDevice {
663         uint16_t product;
664         uint8_t bus, port;
665         libusb_device *device;
666 };
667
668 libusb_device_handle *open_card(int card_index)
669 {       
670         libusb_device **devices;
671         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
672         if (num_devices == -1) {
673                 fprintf(stderr, "Error finding USB devices\n");
674                 exit(1);
675         }
676         vector<USBCardDevice> found_cards;
677         for (ssize_t i = 0; i < num_devices; ++i) {
678                 libusb_device_descriptor desc;
679                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
680                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
681                         exit(1);
682                 }
683
684                 uint8_t bus = libusb_get_bus_number(devices[i]);
685                 uint8_t port = libusb_get_port_number(devices[i]);
686
687                 if (!(desc.idVendor == 0x1edb && desc.idProduct == 0xbd3b) &&
688                     !(desc.idVendor == 0x1edb && desc.idProduct == 0xbd4f)) {
689                         libusb_unref_device(devices[i]);
690                         continue;
691                 }
692
693                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
694         }
695         libusb_free_device_list(devices, 0);
696
697         // Sort the devices to get a consistent ordering.
698         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
699                 if (a.product != b.product)
700                         return a.product < b.product;
701                 if (a.bus != b.bus)
702                         return a.bus < b.bus;
703                 return a.port < b.port;
704         });
705
706         for (size_t i = 0; i < found_cards.size(); ++i) {
707                 fprintf(stderr, "Card %d: Bus %03u Device %03u ", int(i), found_cards[i].bus, found_cards[i].port);
708                 if (found_cards[i].product == 0xbd3b) {
709                         fprintf(stderr, "Intensity Shuttle\n");
710                 } else if (found_cards[i].product == 0xbd4f) {
711                         fprintf(stderr, "UltraStudio SDI\n");
712                 } else {
713                         assert(false);
714                 }
715         }
716
717         if (size_t(card_index) >= found_cards.size()) {
718                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
719                 exit(1);
720         }
721
722         libusb_device_handle *devh;
723         int rc = libusb_open(found_cards[card_index].device, &devh);
724         if (rc < 0) {
725                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
726                 exit(1);
727         }
728
729         for (size_t i = 0; i < found_cards.size(); ++i) {
730                 libusb_unref_device(found_cards[i].device);
731         }
732
733         return devh;
734 }
735
736 void BMUSBCapture::configure_card()
737 {
738         if (video_frame_allocator == nullptr) {
739                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));  // FIXME: leak.
740         }
741         if (audio_frame_allocator == nullptr) {
742                 set_audio_frame_allocator(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));  // FIXME: leak.
743         }
744         dequeue_thread_should_quit = false;
745         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
746
747         int rc;
748         struct libusb_transfer *xfr;
749
750         rc = libusb_init(nullptr);
751         if (rc < 0) {
752                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
753                 exit(1);
754         }
755
756         libusb_device_handle *devh = open_card(card_index);
757         if (!devh) {
758                 fprintf(stderr, "Error finding USB device\n");
759                 exit(1);
760         }
761
762         libusb_config_descriptor *config;
763         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
764         if (rc < 0) {
765                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
766                 exit(1);
767         }
768         printf("%d interface\n", config->bNumInterfaces);
769         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
770                 printf("  interface %d\n", interface_number);
771                 const libusb_interface *interface = &config->interface[interface_number];
772                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
773                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
774                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
775                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
776                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
777                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
778                         }
779                 }
780         }
781
782         rc = libusb_set_configuration(devh, /*configuration=*/1);
783         if (rc < 0) {
784                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
785                 exit(1);
786         }
787
788         rc = libusb_claim_interface(devh, 0);
789         if (rc < 0) {
790                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
791                 exit(1);
792         }
793
794         // Alternate setting 1 is output, alternate setting 2 is input.
795         // Card is reset when switching alternates, so the driver uses
796         // this “double switch” when it wants to reset.
797         //
798         // There's also alternate settings 3 and 4, which seem to be
799         // like 1 and 2 except they advertise less bandwidth needed.
800         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
801         if (rc < 0) {
802                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
803                 exit(1);
804         }
805         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
806         if (rc < 0) {
807                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
808                 exit(1);
809         }
810 #if 0
811         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
812         if (rc < 0) {
813                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
814                 exit(1);
815         }
816 #endif
817
818 #if 0
819         rc = libusb_claim_interface(devh, 3);
820         if (rc < 0) {
821                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
822                 exit(1);
823         }
824 #endif
825
826         // theories:
827         //   44 is some kind of timer register (first 16 bits count upwards)
828         //   24 is some sort of watchdog?
829         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
830         //      (or will go to 0x73c60010?), also seen 0x73c60100
831         //   12 also changes all the time, unclear why  
832         //   16 seems to be autodetected mode somehow
833         //      --    this is e00115e0 after reset?
834         //                    ed0115e0 after mode change [to output?]
835         //                    2d0015e0 after more mode change [to input]
836         //                    ed0115e0 after more mode change
837         //                    2d0015e0 after more mode change
838         //
839         //                    390115e0 seems to indicate we have signal
840         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
841         //
842         //                    200015e0 on startup
843         //         changes to 250115e0 when we sync to the signal
844         //
845         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
846         //
847         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
848         //
849         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
850         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
851         //
852         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
853         //    perhaps some of them are related to analog output?
854         //
855         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
856         //    but the driver sets it to 0x8036802a at some point.
857         //
858         //    all of this is on request 214/215. other requests (192, 219,
859         //    222, 223, 224) are used for firmware upgrade. Probably best to
860         //    stay out of it unless you know what you're doing.
861         //
862         //
863         // register 16:
864         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
865         //
866         // theories:
867         //   0x01 - stable signal
868         //   0x04 - deep color
869         //   0x08 - unknown (audio??)
870         //   0x20 - 720p??
871         //   0x30 - 576p??
872
873         struct ctrl {
874                 int endpoint;
875                 int request;
876                 int index;
877                 uint32_t data;
878         };
879         static const ctrl ctrls[] = {
880                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
881                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
882
883                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
884                 // capture (v210).
885                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
886                 // 0x10000000 = analog audio instead of embedded audio, it seems
887                 // 0x3a000000 = component video? (analog audio)
888                 // 0x3c000000 = composite video? (analog audio)
889                 // 0x3e000000 = s-video? (analog audio)
890                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
891                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
892                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
893                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
894                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
895         };
896
897         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
898                 uint32_t flipped = htonl(ctrls[req].data);
899                 static uint8_t value[4];
900                 memcpy(value, &flipped, sizeof(flipped));
901                 int size = sizeof(value);
902                 //if (ctrls[req].request == 215) size = 0;
903                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
904                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
905                 if (rc < 0) {
906                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
907                         exit(1);
908                 }
909                 
910                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
911                 for (int i = 0; i < rc; ++i) {
912                         printf("%02x", value[i]);
913                 }
914                 printf("\n");
915         }
916
917 #if 0
918         // DEBUG
919         for ( ;; ) {
920                 static int my_index = 0;
921                 static uint8_t value[4];
922                 int size = sizeof(value);
923                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
924                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
925                 if (rc < 0) {
926                         fprintf(stderr, "Error on control\n");
927                         exit(1);
928                 }
929                 printf("rc=%d index=%d: 0x", rc, my_index);
930                 for (int i = 0; i < rc; ++i) {
931                         printf("%02x", value[i]);
932                 }
933                 printf("\n");
934         }
935 #endif
936
937 #if 0
938         // set up an asynchronous transfer of the timer register
939         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
940         static int completed = 0;
941
942         xfr = libusb_alloc_transfer(0);
943         libusb_fill_control_setup(cmdbuf,
944             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
945                 /*index=*/44, /*length=*/4);
946         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
947         xfr->user_data = this;
948         libusb_submit_transfer(xfr);
949
950         // set up an asynchronous transfer of register 24
951         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
952         static int completed2 = 0;
953
954         xfr = libusb_alloc_transfer(0);
955         libusb_fill_control_setup(cmdbuf2,
956             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
957                 /*index=*/24, /*length=*/4);
958         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
959         xfr->user_data = this;
960         libusb_submit_transfer(xfr);
961 #endif
962
963         // set up an asynchronous transfer of the register dump
964         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
965         static int completed3 = 0;
966
967         xfr = libusb_alloc_transfer(0);
968         libusb_fill_control_setup(cmdbuf3,
969             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
970                 /*index=*/current_register, /*length=*/4);
971         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
972         xfr->user_data = this;
973         //libusb_submit_transfer(xfr);
974
975         audiofp = fopen("audio.raw", "wb");
976
977         // set up isochronous transfers for audio and video
978         for (int e = 3; e <= 4; ++e) {
979                 //int num_transfers = (e == 3) ? 6 : 6;
980                 int num_transfers = 10;
981                 for (int i = 0; i < num_transfers; ++i) {
982                         int num_iso_pack, size;
983                         if (e == 3) {
984                                 // Video seems to require isochronous packets scaled with the width; 
985                                 // seemingly six lines is about right, rounded up to the required 1kB
986                                 // multiple.
987                                 size = WIDTH * 2 * 6;
988                                 // Note that for 10-bit input, you'll need to increase size accordingly.
989                                 //size = size * 4 / 3;
990                                 if (size % 1024 != 0) {
991                                         size &= ~1023;
992                                         size += 1024;
993                                 }
994                                 num_iso_pack = (2 << 16) / size;  // 128 kB.
995                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
996                         } else {
997                                 size = 0xc0;
998                                 num_iso_pack = 80;
999                         }
1000                         int num_bytes = num_iso_pack * size;
1001                         uint8_t *buf = new uint8_t[num_bytes];
1002
1003                         xfr = libusb_alloc_transfer(num_iso_pack);
1004                         if (!xfr) {
1005                                 fprintf(stderr, "oom\n");
1006                                 exit(1);
1007                         }
1008
1009                         int ep = LIBUSB_ENDPOINT_IN | e;
1010                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
1011                                 num_iso_pack, cb_xfr, nullptr, 0);
1012                         libusb_set_iso_packet_lengths(xfr, size);
1013                         xfr->user_data = this;
1014                         iso_xfrs.push_back(xfr);
1015                 }
1016         }
1017 }
1018
1019 void BMUSBCapture::start_bm_capture()
1020 {
1021         printf("starting capture\n");
1022         int i = 0;
1023         for (libusb_transfer *xfr : iso_xfrs) {
1024                 printf("submitting transfer...\n");
1025                 int rc = libusb_submit_transfer(xfr);
1026                 ++i;
1027                 if (rc < 0) {
1028                         //printf("num_bytes=%d\n", num_bytes);
1029                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1030                                 xfr->endpoint, i, libusb_error_name(rc));
1031                         exit(1);
1032                 }
1033         }
1034
1035
1036 #if 0
1037         libusb_release_interface(devh, 0);
1038 out:
1039         if (devh)
1040                 libusb_close(devh);
1041         libusb_exit(nullptr);
1042         return rc;
1043 #endif
1044 }
1045
1046 void BMUSBCapture::stop_dequeue_thread()
1047 {
1048         dequeue_thread_should_quit = true;
1049         queues_not_empty.notify_all();
1050         dequeue_thread.join();
1051 }
1052
1053 void BMUSBCapture::start_bm_thread()
1054 {
1055         should_quit = false;
1056         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1057 }
1058
1059 void BMUSBCapture::stop_bm_thread()
1060 {
1061         should_quit = true;
1062         usb_thread.join();
1063 }