]> git.sesse.net Git - bmusb/blob - bmusb.cpp
Fix a queue access not under a mutex.
[bmusb] / bmusb.cpp
1 // Intensity Shuttle USB3 prototype capture driver, v0.3
2 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
3 // (can do captures for hours at a time with no drops), except during startup
4 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
5 // Audio comes out as 8-channel 24-bit raw audio.
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <libusb.h>
10 #include <netinet/in.h>
11 #include <sched.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #ifdef __SSE4_1__
17 #include <immintrin.h>
18 #endif
19 #include "bmusb.h"
20
21 #include <algorithm>
22 #include <atomic>
23 #include <condition_variable>
24 #include <cstddef>
25 #include <cstdint>
26 #include <deque>
27 #include <functional>
28 #include <memory>
29 #include <mutex>
30 #include <stack>
31 #include <thread>
32
33 using namespace std;
34 using namespace std::placeholders;
35
// Nominal frame geometry. In the part of the file visible here, these are only
// referenced by the commented-out FRAME_SIZE formulas further down.
#define WIDTH 1280
#define HEIGHT 750  /* 30 lines ancillary data? */
//#define WIDTH 1920
//#define HEIGHT 1125  /* ??? lines ancillary data? */
// Number of bytes at the start of a video frame that dump_frame() skips; the
// dequeue thread also hands this value to the frame callback with each video frame.
#define HEADER_SIZE 44
//#define HEADER_SIZE 0
// The audio counterpart of HEADER_SIZE (skipped by dump_audio_block()).
#define AUDIO_HEADER_SIZE 4

//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
// Size in bytes (8 MiB) of each buffer in the default video frame allocator.
#define FRAME_SIZE (8 << 20)
47
// Output stream written by dump_audio_block(). Nothing in the visible part of
// this file opens it.
FILE *audiofp;

// NOTE(review): presumably the thread running usb_thread_func(); it is not
// started in the visible part of this file -- confirm.
thread usb_thread;
// Polled by usb_thread_func(); its event loop ends once this becomes true.
atomic<bool> should_quit;

// Out-of-line definition of the FrameAllocator destructor.
FrameAllocator::~FrameAllocator() {}
54
// Audio is more important than video, and also much cheaper.
// By having many more audio frames available, hopefully if something
// starts to drop, we'll have CPU load go down (from not having to
// process as much video) before we have to drop audio.
//
// These are the pool sizes configure_card() passes to the default
// MallocFrameAllocator instances for video and audio, respectively.
#define NUM_QUEUED_VIDEO_FRAMES 16
#define NUM_QUEUED_AUDIO_FRAMES 64
61
62 class MallocFrameAllocator : public FrameAllocator {
63 public:
64         MallocFrameAllocator(size_t frame_size, size_t num_queued_frames);
65         Frame alloc_frame() override;
66         void release_frame(Frame frame) override;
67
68 private:
69         size_t frame_size;
70
71         mutex freelist_mutex;
72         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
73 };
74
75 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size, size_t num_queued_frames)
76         : frame_size(frame_size)
77 {
78         for (size_t i = 0; i < num_queued_frames; ++i) {
79                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
80         }
81 }
82
83 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
84 {
85         Frame vf;
86         vf.owner = this;
87
88         unique_lock<mutex> lock(freelist_mutex);  // Meh.
89         if (freelist.empty()) {
90                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
91                         frame_size);
92         } else {
93                 vf.data = freelist.top().release();
94                 vf.size = frame_size;
95                 freelist.pop();  // Meh.
96         }
97         return vf;
98 }
99
100 void MallocFrameAllocator::release_frame(Frame frame)
101 {
102         if (frame.overflow > 0) {
103                 printf("%d bytes overflow after last (malloc) frame\n", int(frame.overflow));
104         }
105         unique_lock<mutex> lock(freelist_mutex);
106         freelist.push(unique_ptr<uint8_t[]>(frame.data));
107 }
108
// Compares two 16-bit counters that are allowed to wrap around.
// Returns true if <b> is between 1 and 0x7fff steps ahead of <a>, counted
// modulo 0x10000; equal values, and values 0x8000 or more steps ahead, give false.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        // Forward distance from a to b, reduced modulo 2^16.
        const uint16_t distance = uint16_t(b - a);
        return distance != 0 && distance < 0x8000;
}
120
121 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
122 {
123         unique_lock<mutex> lock(queue_lock);
124         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
125                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
126                         q->back().timecode, timecode);
127                 frame.owner->release_frame(frame);
128                 return;
129         }
130
131         QueuedFrame qf;
132         qf.format = format;
133         qf.timecode = timecode;
134         qf.frame = frame;
135         q->push_back(move(qf));
136         queues_not_empty.notify_one();  // might be spurious
137 }
138
139 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
140 {
141         FILE *fp = fopen(filename, "wb");
142         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
143                 printf("short write!\n");
144         }
145         fclose(fp);
146 }
147
148 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
149 {
150         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
151 }
152
153 void BMUSBCapture::dequeue_thread_func()
154 {
155         if (has_dequeue_callbacks) {
156                 dequeue_init_callback();
157         }
158         while (!dequeue_thread_should_quit) {
159                 unique_lock<mutex> lock(queue_lock);
160                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
161
162                 uint16_t video_timecode = pending_video_frames.front().timecode;
163                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
164                 if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
165                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
166                                 video_timecode);
167                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
168                         pending_video_frames.pop_front();
169                 } else if (uint16_less_than_with_wraparound(audio_timecode, video_timecode)) {
170                         printf("Audio block 0x%04x without corresponding video block, sending blank frame.\n",
171                                 audio_timecode);
172                         QueuedFrame audio_frame = pending_audio_frames.front();
173                         pending_audio_frames.pop_front();
174                         lock.unlock();
175                         frame_callback(audio_timecode,
176                                        FrameAllocator::Frame(), 0, 0x0000,
177                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
178                 } else {
179                         QueuedFrame video_frame = pending_video_frames.front();
180                         QueuedFrame audio_frame = pending_audio_frames.front();
181                         pending_audio_frames.pop_front();
182                         pending_video_frames.pop_front();
183                         lock.unlock();
184
185 #if 0
186                         char filename[255];
187                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
188                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
189                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
190 #endif
191
192                         frame_callback(video_timecode,
193                                        video_frame.frame, HEADER_SIZE, video_frame.format,
194                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
195                 }
196         }
197         if (has_dequeue_callbacks) {
198                 dequeue_cleanup_callback();
199         }
200 }
201
202 void BMUSBCapture::start_new_frame(const uint8_t *start)
203 {
204         uint16_t format = (start[3] << 8) | start[2];
205         uint16_t timecode = (start[1] << 8) | start[0];
206
207         if (current_video_frame.len > 0) {
208                 // If format is 0x0800 (no signal), add a fake (empty) audio
209                 // frame to get it out of the queue.
210                 // TODO: Figure out if there are other formats that come with
211                 // no audio, and treat them the same.
212                 if (format == 0x0800) {
213                         FrameAllocator::Frame fake_audio_frame = audio_frame_allocator->alloc_frame();
214                         if (fake_audio_frame.data == nullptr) {
215                                 // Oh well, it's just a no-signal frame anyway.
216                                 printf("Couldn't allocate fake audio frame, also dropping no-signal video frame.\n");
217                                 current_video_frame.owner->release_frame(current_video_frame);
218                                 current_video_frame = video_frame_allocator->alloc_frame();
219                                 return;
220                         }
221                         queue_frame(format, timecode, fake_audio_frame, &pending_audio_frames);
222                 }
223                 //dump_frame();
224                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
225         }
226         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
227         //      format, timecode,
228         //      //start[7], start[6], start[5], start[4],
229         //      read_current_frame, FRAME_SIZE);
230
231         current_video_frame = video_frame_allocator->alloc_frame();
232         //if (current_video_frame.data == nullptr) {
233         //      read_current_frame = -1;
234         //} else {
235         //      read_current_frame = 0;
236         //}
237 }
238
239 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
240 {
241         uint16_t format = (start[3] << 8) | start[2];
242         uint16_t timecode = (start[1] << 8) | start[0];
243         if (current_audio_frame.len > 0) {
244                 //dump_audio_block();
245                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
246         }
247         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
248         //      format, timecode, read_current_audio_block);
249         current_audio_frame = audio_frame_allocator->alloc_frame();
250 }
251
252 #if 0
253 static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
254 {
255         //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
256         for (unsigned j = 0; j < pack->actual_length; j++) {
257         //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
258                 printf("%02x", xfr->buffer[j + offset]);
259                 if ((j % 16) == 15)
260                         printf("\n");
261                 else if ((j % 8) == 7)
262                         printf("  ");
263                 else
264                         printf(" ");
265         }
266 }
267 #endif
268
// De-interleaves <n> bytes from <src>: the bytes at even positions go to
// <dest1> and the bytes at odd positions go to <dest2>, each packed densely.
// <n> must be even.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t pos = 0; pos < n; pos += 2) {
                const size_t pair = pos / 2;
                dest1[pair] = src[pos];
                dest2[pair] = src[pos + 1];
        }
}
280
281 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
282 {
283         if (current_frame->data == nullptr ||
284             current_frame->len > current_frame->size ||
285             start == end) {
286                 return;
287         }
288
289         int bytes = end - start;
290         if (current_frame->len + bytes > current_frame->size) {
291                 current_frame->overflow = current_frame->len + bytes - current_frame->size;
292                 current_frame->len = current_frame->size;
293                 if (current_frame->overflow > 1048576) {
294                         printf("%d bytes overflow after last %s frame\n",
295                                 int(current_frame->overflow), frame_type_name);
296                         current_frame->overflow = 0;
297                 }
298                 //dump_frame();
299         } else {
300                 if (current_frame->interleaved) {
301                         uint8_t *data = current_frame->data + current_frame->len / 2;
302                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
303                         if (current_frame->len % 2 == 1) {
304                                 ++data;
305                                 swap(data, data2);
306                         }
307                         if (bytes % 2 == 1) {
308                                 *data++ = *start++;
309                                 swap(data, data2);
310                                 ++current_frame->len;
311                                 --bytes;
312                         }
313                         memcpy_interleaved(data, data2, start, bytes);
314                         current_frame->len += bytes;
315                 } else {
316                         memcpy(current_frame->data + current_frame->len, start, bytes);
317                         current_frame->len += bytes;
318                 }
319         }
320 }
321
322 #ifdef __SSE4_1__
323
324 #if 0
// Debugging aid: prints <name>, left-aligned in a ten-character column,
// followed by the 32 bytes of <n> in hex, lowest byte first, with an extra
// space between groups of eight bytes.
void avx2_dump(const char *name, __m256i n)
{
        // Spill the register to memory so that the bytes can be walked in a loop.
        uint8_t bytes[32];
        _mm256_storeu_si256((__m256i *)bytes, n);

        printf("%-10s:", name);
        for (int idx = 0; idx < 32; ++idx) {
                if (idx != 0 && idx % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[idx]);
        }
        printf("\n");
}
365 #endif
366
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data
// from main memory.
//
// Complicated cases are left to the slow path; it basically stops copying
// up until the first instance of "sync_char" (usually a bit before, actually).
// This is fine, since 0x00 bytes shouldn't really show up in normal picture
// data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Parameters:
//   current_frame - frame to append to; its len is advanced by what was taken.
//   start, limit  - the input range [start, limit).
//   sync_char     - byte value that stops the copy.
//
// Returns a pointer to the first input byte that was NOT appended to the
// frame; the caller has to deal with everything from there on. This is
// <start> itself whenever the fast path declines to do anything.
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
        // Same bail-out conditions as add_to_frame().
        if (current_frame->data == nullptr ||
            current_frame->len > current_frame->size ||
            start == limit) {
                return start;
        }
        size_t orig_bytes = limit - start;
        if (orig_bytes < 128) {
                // Don't bother.
                return start;
        }

        // Don't read more bytes than we can write.
        limit = min(limit, start + (current_frame->size - current_frame->len));

        // Align end to 32 bytes.
        limit = (const uint8_t *)(intptr_t(limit) & ~31);

        if (start >= limit) {
                return start;
        }

        // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
        if (aligned_start != start) {
                // The unaligned head goes through the ordinary add_to_frame();
                // a sync byte in there ends the fast path right away.
                const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
                if (sync_start == nullptr) {
                        add_to_frame(current_frame, "", start, aligned_start);
                } else {
                        add_to_frame(current_frame, "", start, sync_start);
                        return sync_start;
                }
        }

        // Make the length a multiple of 64.
        // (The interleaved loops below consume two vectors per iteration.)
        if (current_frame->interleaved) {
                if (((limit - aligned_start) % 64) != 0) {
                        limit -= 32;
                }
                assert(((limit - aligned_start) % 64) == 0);
        }

#if __AVX2__
        // AVX2 version: works on 32-byte vectors.
        const __m256i needle = _mm256_set1_epi8(sync_char);

        const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
                // out1 receives the even-positioned input bytes and out2 the
                // odd-positioned ones; which plane that is depends on the
                // parity of the current length.
                __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
                __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                // Within each 128-bit lane: even bytes to the low half, odd bytes to the high half.
                __m256i shuffle_cw = _mm256_set_epi8(
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
                        // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
                        __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
                        __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp

                        // Nonzero in any byte position where either vector holds sync_char.
                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
                        __m256i found = _mm256_or_si256(found1, found2);

                        data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
                        data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
                
                        data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
                        data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop

                        // lo: the low 128 bits of data1 and data2; hi: their high 128 bits.
                        __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
                        __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);

                        _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
                        _mm256_storeu_si256(out2, hi);

                        // A sync byte in this chunk: stop without advancing, so
                        // the chunk just stored does not count towards len.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                // Only the chunks that were fully accepted count.
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
                while (in < (const __m256i *)limit) {
                        __m256i data = _mm256_load_si256(in);
                        _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
                        __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // A sync byte in this chunk: stop without advancing.
                        if (!_mm256_testz_si256(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#else
        // SSE4.1 version: same structure as above, on 16-byte vectors.
        const __m128i needle = _mm_set1_epi8(sync_char);

        const __m128i *in = (const __m128i *)aligned_start;
        if (current_frame->interleaved) {
                // See the AVX2 branch for the meaning of out1/out2.
                __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
                __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }

                __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
                while (in < (const __m128i *)limit) {
                        __m128i data1 = _mm_load_si128(in);
                        __m128i data2 = _mm_load_si128(in + 1);
                        // Split every 16-bit word into its low (even-positioned)
                        // and high (odd-positioned) byte, then pack each set.
                        __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
                        __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
                        __m128i data1_hi = _mm_srli_epi16(data1, 8);
                        __m128i data2_hi = _mm_srli_epi16(data2, 8);
                        __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
                        _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
                        __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
                        _mm_storeu_si128(out2, hi);
                        __m128i found1 = _mm_cmpeq_epi8(data1, needle);
                        __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                        // A sync byte in either vector: stop without advancing.
                        if (!_mm_testz_si128(found1, found1) ||
                            !_mm_testz_si128(found2, found2)) {
                                break;
                        }

                        in += 2;
                        ++out1;
                        ++out2;
                }
                current_frame->len += (uint8_t *)in - aligned_start;
        } else {
                __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
                while (in < (const __m128i *)limit) {
                        __m128i data = _mm_load_si128(in);
                        _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
                        __m128i found = _mm_cmpeq_epi8(data, needle);
                        // A sync byte in this chunk: stop without advancing.
                        if (!_mm_testz_si128(found, found)) {
                                break;
                        }

                        ++in;
                        ++out;
                }
                current_frame->len = (uint8_t *)out - current_frame->data;
        }
#endif

        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);

        // First input byte that was not taken; it may lie before the actual sync byte.
        return (const uint8_t *)in;
}
534 #endif
535
536 void decode_packs(const libusb_transfer *xfr,
537                   const char *sync_pattern,
538                   int sync_length,
539                   FrameAllocator::Frame *current_frame,
540                   const char *frame_type_name,
541                   function<void(const uint8_t *start)> start_callback)
542 {
543         int offset = 0;
544         for (int i = 0; i < xfr->num_iso_packets; i++) {
545                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
546
547                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
548                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
549                         continue;
550 //exit(5);
551                 }
552
553                 const uint8_t *start = xfr->buffer + offset;
554                 const uint8_t *limit = start + pack->actual_length;
555                 while (start < limit) {  // Usually runs only one iteration.
556 #ifdef __SSE4_1__
557                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
558                         if (start == limit) break;
559                         assert(start < limit);
560 #endif
561
562                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
563                         if (start_next_frame == nullptr) {
564                                 // add the rest of the buffer
565                                 add_to_frame(current_frame, frame_type_name, start, limit);
566                                 break;
567                         } else {
568                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
569                                 start = start_next_frame + sync_length;  // skip sync
570                                 start_callback(start);
571                         }
572                 }
573 #if 0
574                 dump_pack(xfr, offset, pack);
575 #endif
576                 offset += pack->length;
577         }
578 }
579
580 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
581 {
582         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
583                 fprintf(stderr, "transfer status %d\n", xfr->status);
584                 libusb_free_transfer(xfr);
585                 exit(3);
586         }
587
588         assert(xfr->user_data != nullptr);
589         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
590
591         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
592                 if (xfr->endpoint == 0x84) {
593                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
594                 } else {
595                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
596                 }
597         }
598         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
599                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
600                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
601 #if 0
602                 if (setup->wIndex == 44) {
603                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
604                 } else {
605                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
606                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
607                 }
608 #else
609                 memcpy(usb->register_file + usb->current_register, buf, 4);
610                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
611                 if (usb->current_register == 0) {
612                         // read through all of them
613                         printf("register dump:");
614                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
615                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
616                         }
617                         printf("\n");
618                 }
619                 libusb_fill_control_setup(xfr->buffer,
620                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
621                         /*index=*/usb->current_register, /*length=*/4);
622 #endif
623         }
624
625 #if 0
626         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
627         for (i = 0; i < xfr->actual_length; i++) {
628                 printf("%02x", xfr->buffer[i]);
629                 if (i % 16)
630                         printf("\n");
631                 else if (i % 8)
632                         printf("  ");
633                 else
634                         printf(" ");
635         }
636 #endif
637
638         int rc = libusb_submit_transfer(xfr);
639         if (rc < 0) {
640                 fprintf(stderr, "error re-submitting URB: %s\n", libusb_error_name(rc));
641                 exit(1);
642         }
643 }
644
645 void BMUSBCapture::usb_thread_func()
646 {
647         sched_param param;
648         memset(&param, 0, sizeof(param));
649         param.sched_priority = 1;
650         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
651                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
652         }
653         while (!should_quit) {
654                 int rc = libusb_handle_events(nullptr);
655                 if (rc != LIBUSB_SUCCESS)
656                         break;
657         }
658 }
659
660 struct USBCardDevice {
661         uint16_t product;
662         uint8_t bus, port;
663         libusb_device *device;
664 };
665
666 libusb_device_handle *open_card(int card_index)
667 {       
668         libusb_device **devices;
669         ssize_t num_devices = libusb_get_device_list(nullptr, &devices);
670         if (num_devices == -1) {
671                 fprintf(stderr, "Error finding USB devices\n");
672                 exit(1);
673         }
674         vector<USBCardDevice> found_cards;
675         for (ssize_t i = 0; i < num_devices; ++i) {
676                 libusb_device_descriptor desc;
677                 if (libusb_get_device_descriptor(devices[i], &desc) < 0) {
678                         fprintf(stderr, "Error getting device descriptor for device %d\n", int(i));
679                         exit(1);
680                 }
681
682                 uint8_t bus = libusb_get_bus_number(devices[i]);
683                 uint8_t port = libusb_get_port_number(devices[i]);
684
685                 if (!(desc.idVendor == 0x1edb && desc.idProduct == 0xbd3b) &&
686                     !(desc.idVendor == 0x1edb && desc.idProduct == 0xbd4f)) {
687                         libusb_unref_device(devices[i]);
688                         continue;
689                 }
690
691                 found_cards.push_back({ desc.idProduct, bus, port, devices[i] });
692         }
693         libusb_free_device_list(devices, 0);
694
695         // Sort the devices to get a consistent ordering.
696         sort(found_cards.begin(), found_cards.end(), [](const USBCardDevice &a, const USBCardDevice &b) {
697                 if (a.product != b.product)
698                         return a.product < b.product;
699                 if (a.bus != b.bus)
700                         return a.bus < b.bus;
701                 return a.port < b.port;
702         });
703
704         for (size_t i = 0; i < found_cards.size(); ++i) {
705                 fprintf(stderr, "Card %d: Bus %03u Device %03u ", int(i), found_cards[i].bus, found_cards[i].port);
706                 if (found_cards[i].product == 0xbd3b) {
707                         fprintf(stderr, "Intensity Shuttle\n");
708                 } else if (found_cards[i].product == 0xbd4f) {
709                         fprintf(stderr, "UltraStudio SDI\n");
710                 } else {
711                         assert(false);
712                 }
713         }
714
715         if (size_t(card_index) >= found_cards.size()) {
716                 fprintf(stderr, "Could not open card %d (only %d found)\n", card_index, int(found_cards.size()));
717                 exit(1);
718         }
719
720         libusb_device_handle *devh;
721         int rc = libusb_open(found_cards[card_index].device, &devh);
722         if (rc < 0) {
723                 fprintf(stderr, "Error opening card %d: %s\n", card_index, libusb_error_name(rc));
724                 exit(1);
725         }
726
727         for (size_t i = 0; i < found_cards.size(); ++i) {
728                 libusb_unref_device(found_cards[i].device);
729         }
730
731         return devh;
732 }
733
734 void BMUSBCapture::configure_card()
735 {
736         if (video_frame_allocator == nullptr) {
737                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE, NUM_QUEUED_VIDEO_FRAMES));  // FIXME: leak.
738         }
739         if (audio_frame_allocator == nullptr) {
740                 set_audio_frame_allocator(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));  // FIXME: leak.
741         }
742         dequeue_thread_should_quit = false;
743         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
744
745         int rc;
746         struct libusb_transfer *xfr;
747
748         rc = libusb_init(nullptr);
749         if (rc < 0) {
750                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
751                 exit(1);
752         }
753
754         libusb_device_handle *devh = open_card(card_index);
755         if (!devh) {
756                 fprintf(stderr, "Error finding USB device\n");
757                 exit(1);
758         }
759
760         libusb_config_descriptor *config;
761         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
762         if (rc < 0) {
763                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
764                 exit(1);
765         }
766         printf("%d interface\n", config->bNumInterfaces);
767         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
768                 printf("  interface %d\n", interface_number);
769                 const libusb_interface *interface = &config->interface[interface_number];
770                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
771                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
772                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
773                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
774                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
775                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
776                         }
777                 }
778         }
779
780         rc = libusb_set_configuration(devh, /*configuration=*/1);
781         if (rc < 0) {
782                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
783                 exit(1);
784         }
785
786         rc = libusb_claim_interface(devh, 0);
787         if (rc < 0) {
788                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
789                 exit(1);
790         }
791
792         // Alternate setting 1 is output, alternate setting 2 is input.
793         // Card is reset when switching alternates, so the driver uses
794         // this “double switch” when it wants to reset.
795         //
796         // There's also alternate settings 3 and 4, which seem to be
797         // like 1 and 2 except they advertise less bandwidth needed.
798         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
799         if (rc < 0) {
800                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
801                 exit(1);
802         }
803         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
804         if (rc < 0) {
805                 fprintf(stderr, "Error setting alternate 2: %s\n", libusb_error_name(rc));
806                 exit(1);
807         }
808 #if 0
809         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
810         if (rc < 0) {
811                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
812                 exit(1);
813         }
814 #endif
815
816 #if 0
817         rc = libusb_claim_interface(devh, 3);
818         if (rc < 0) {
819                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
820                 exit(1);
821         }
822 #endif
823
824         // theories:
825         //   44 is some kind of timer register (first 16 bits count upwards)
826         //   24 is some sort of watchdog?
827         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
828         //      (or will go to 0x73c60010?), also seen 0x73c60100
829         //   12 also changes all the time, unclear why  
830         //   16 seems to be autodetected mode somehow
831         //      --    this is e00115e0 after reset?
832         //                    ed0115e0 after mode change [to output?]
833         //                    2d0015e0 after more mode change [to input]
834         //                    ed0115e0 after more mode change
835         //                    2d0015e0 after more mode change
836         //
837         //                    390115e0 seems to indicate we have signal
838         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
839         //
840         //                    200015e0 on startup
841         //         changes to 250115e0 when we sync to the signal
842         //
843         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
844         //
845         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
846         //
847         //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
848         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
849         //
850         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
851         //    perhaps some of them are related to analog output?
852         //
853         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
854         //    but the driver sets it to 0x8036802a at some point.
855         //
856         //    all of this is on request 214/215. other requests (192, 219,
857         //    222, 223, 224) are used for firmware upgrade. Probably best to
858         //    stay out of it unless you know what you're doing.
859         //
860         //
861         // register 16:
862         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
863         //
864         // theories:
865         //   0x01 - stable signal
866         //   0x04 - deep color
867         //   0x08 - unknown (audio??)
868         //   0x20 - 720p??
869         //   0x30 - 576p??
870
871         struct ctrl {
872                 int endpoint;
873                 int request;
874                 int index;
875                 uint32_t data;
876         };
877         static const ctrl ctrls[] = {
878                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
879                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
880
881                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
882                 // capture (v210).
883                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
884                 // 0x10000000 = analog audio instead of embedded audio, it seems
885                 // 0x3a000000 = component video? (analog audio)
886                 // 0x3c000000 = composite video? (analog audio)
887                 // 0x3e000000 = s-video? (analog audio)
888                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
889                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
890                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
891                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
892                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
893         };
894
895         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
896                 uint32_t flipped = htonl(ctrls[req].data);
897                 static uint8_t value[4];
898                 memcpy(value, &flipped, sizeof(flipped));
899                 int size = sizeof(value);
900                 //if (ctrls[req].request == 215) size = 0;
901                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
902                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
903                 if (rc < 0) {
904                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
905                         exit(1);
906                 }
907                 
908                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
909                 for (int i = 0; i < rc; ++i) {
910                         printf("%02x", value[i]);
911                 }
912                 printf("\n");
913         }
914
915 #if 0
916         // DEBUG
917         for ( ;; ) {
918                 static int my_index = 0;
919                 static uint8_t value[4];
920                 int size = sizeof(value);
921                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
922                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
923                 if (rc < 0) {
924                         fprintf(stderr, "Error on control\n");
925                         exit(1);
926                 }
927                 printf("rc=%d index=%d: 0x", rc, my_index);
928                 for (int i = 0; i < rc; ++i) {
929                         printf("%02x", value[i]);
930                 }
931                 printf("\n");
932         }
933 #endif
934
935 #if 0
936         // set up an asynchronous transfer of the timer register
937         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
938         static int completed = 0;
939
940         xfr = libusb_alloc_transfer(0);
941         libusb_fill_control_setup(cmdbuf,
942             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
943                 /*index=*/44, /*length=*/4);
944         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
945         xfr->user_data = this;
946         libusb_submit_transfer(xfr);
947
948         // set up an asynchronous transfer of register 24
949         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
950         static int completed2 = 0;
951
952         xfr = libusb_alloc_transfer(0);
953         libusb_fill_control_setup(cmdbuf2,
954             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
955                 /*index=*/24, /*length=*/4);
956         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
957         xfr->user_data = this;
958         libusb_submit_transfer(xfr);
959 #endif
960
961         // set up an asynchronous transfer of the register dump
962         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
963         static int completed3 = 0;
964
965         xfr = libusb_alloc_transfer(0);
966         libusb_fill_control_setup(cmdbuf3,
967             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
968                 /*index=*/current_register, /*length=*/4);
969         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
970         xfr->user_data = this;
971         //libusb_submit_transfer(xfr);
972
973         audiofp = fopen("audio.raw", "wb");
974
975         // set up isochronous transfers for audio and video
976         for (int e = 3; e <= 4; ++e) {
977                 //int num_transfers = (e == 3) ? 6 : 6;
978                 int num_transfers = 10;
979                 for (int i = 0; i < num_transfers; ++i) {
980                         int num_iso_pack, size;
981                         if (e == 3) {
982                                 // Video seems to require isochronous packets scaled with the width; 
983                                 // seemingly six lines is about right, rounded up to the required 1kB
984                                 // multiple.
985                                 size = WIDTH * 2 * 6;
986                                 // Note that for 10-bit input, you'll need to increase size accordingly.
987                                 //size = size * 4 / 3;
988                                 if (size % 1024 != 0) {
989                                         size &= ~1023;
990                                         size += 1024;
991                                 }
992                                 num_iso_pack = (2 << 16) / size;  // 128 kB.
993                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
994                         } else {
995                                 size = 0xc0;
996                                 num_iso_pack = 80;
997                         }
998                         int num_bytes = num_iso_pack * size;
999                         uint8_t *buf = new uint8_t[num_bytes];
1000
1001                         xfr = libusb_alloc_transfer(num_iso_pack);
1002                         if (!xfr) {
1003                                 fprintf(stderr, "oom\n");
1004                                 exit(1);
1005                         }
1006
1007                         int ep = LIBUSB_ENDPOINT_IN | e;
1008                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
1009                                 num_iso_pack, cb_xfr, nullptr, 0);
1010                         libusb_set_iso_packet_lengths(xfr, size);
1011                         xfr->user_data = this;
1012                         iso_xfrs.push_back(xfr);
1013                 }
1014         }
1015 }
1016
1017 void BMUSBCapture::start_bm_capture()
1018 {
1019         printf("starting capture\n");
1020         int i = 0;
1021         for (libusb_transfer *xfr : iso_xfrs) {
1022                 printf("submitting transfer...\n");
1023                 int rc = libusb_submit_transfer(xfr);
1024                 ++i;
1025                 if (rc < 0) {
1026                         //printf("num_bytes=%d\n", num_bytes);
1027                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
1028                                 xfr->endpoint, i, libusb_error_name(rc));
1029                         exit(1);
1030                 }
1031         }
1032
1033
1034 #if 0
1035         libusb_release_interface(devh, 0);
1036 out:
1037         if (devh)
1038                 libusb_close(devh);
1039         libusb_exit(nullptr);
1040         return rc;
1041 #endif
1042 }
1043
1044 void BMUSBCapture::stop_dequeue_thread()
1045 {
1046         dequeue_thread_should_quit = true;
1047         queues_not_empty.notify_all();
1048         dequeue_thread.join();
1049 }
1050
1051 void BMUSBCapture::start_bm_thread()
1052 {
1053         should_quit = false;
1054         usb_thread = thread(&BMUSBCapture::usb_thread_func);
1055 }
1056
1057 void BMUSBCapture::stop_bm_thread()
1058 {
1059         should_quit = true;
1060         usb_thread.join();
1061 }