]> git.sesse.net Git - nageru/blob - bmusb.cpp
Support arbitrary SBS zooming (ie., zooming in on the little picture, too).
[nageru] / bmusb.cpp
1 // TODO: Replace with linking to upstream bmusb.
2
3 // Intensity Shuttle USB3 prototype capture driver, v0.3
4 // Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
5 // (can do captures for hours at a time with no drops), except during startup
6 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
7 // Audio comes out as 8-channel 24-bit raw audio.
8
9 #include <assert.h>
10 #include <errno.h>
11 #include <libusb.h>
12 #include <netinet/in.h>
13 #include <sched.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #ifdef __SSE4_1__
19 #include <immintrin.h>
20 #endif
21 #include "bmusb.h"
22
23 #include <algorithm>
24 #include <atomic>
25 #include <condition_variable>
26 #include <cstddef>
27 #include <cstdint>
28 #include <deque>
29 #include <functional>
30 #include <memory>
31 #include <mutex>
32 #include <stack>
33 #include <thread>
34
35 using namespace std;
36 using namespace std::placeholders;
37
// Nominal raw frame dimensions. In the visible code these are only referenced
// by the commented-out FRAME_SIZE formulas further down.
38 #define WIDTH 1280
39 #define HEIGHT 750  /* 30 lines ancillary data? */
40 //#define WIDTH 1920
41 //#define HEIGHT 1125  /* ??? lines ancillary data? */
// Number of bytes at the start of a video frame buffer that are skipped:
// dump_frame() starts writing after them, and dequeue_thread_func() passes
// the value to frame_callback as the video offset.
42 #define HEADER_SIZE 44
43 //#define HEADER_SIZE 0
// Same as HEADER_SIZE, but for audio blocks.
44 #define AUDIO_HEADER_SIZE 4
45
46 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
47 //#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
// Fixed size (8 MiB) of each buffer in the default video frame allocator
// that configure_card() creates.
48 #define FRAME_SIZE (8 << 20)
49
// Destination stream for dump_audio_block(). Nothing in the visible part of
// this file opens it.
50 FILE *audiofp;
51
// Thread object for the USB event loop; presumably runs usb_thread_func()
// (the code that starts it is not visible here -- TODO confirm).
52 thread usb_thread;
// Polled by usb_thread_func(); once it reads true, the event loop exits
// after the current libusb_handle_events() call returns.
53 atomic<bool> should_quit;
54
// Out-of-line (empty) destructor for FrameAllocator, which is declared elsewhere.
55 FrameAllocator::~FrameAllocator() {}
56
57 #define NUM_QUEUED_FRAMES 16
58 class MallocFrameAllocator : public FrameAllocator {
59 public:
60         MallocFrameAllocator(size_t frame_size);
61         Frame alloc_frame() override;
62         void release_frame(Frame frame) override;
63
64 private:
65         size_t frame_size;
66
67         mutex freelist_mutex;
68         stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
69 };
70
71 MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
72         : frame_size(frame_size)
73 {
74         for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
75                 freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
76         }
77 }
78
79 FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
80 {
81         Frame vf;
82         vf.owner = this;
83
84         unique_lock<mutex> lock(freelist_mutex);  // Meh.
85         if (freelist.empty()) {
86                 printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
87                         frame_size);
88         } else {
89                 vf.data = freelist.top().release();
90                 vf.size = frame_size;
91                 freelist.pop();  // Meh.
92         }
93         return vf;
94 }
95
96 void MallocFrameAllocator::release_frame(Frame frame)
97 {
98         unique_lock<mutex> lock(freelist_mutex);
99         freelist.push(unique_ptr<uint8_t[]>(frame.data));
100 }
101
// Returns true iff <a> comes strictly before <b> in 16-bit modular order,
// i.e., counting upwards from a (with wraparound at 0x10000) reaches b
// in at least 1 and at most 0x7fff steps.
bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
{
        const uint16_t forward_distance = uint16_t(b - a);
        return forward_distance != 0 && forward_distance < 0x8000;
}
113
114 void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
115 {
116         if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
117                 printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
118                         q->back().timecode, timecode);
119                 frame.owner->release_frame(frame);
120                 return;
121         }
122
123         QueuedFrame qf;
124         qf.format = format;
125         qf.timecode = timecode;
126         qf.frame = frame;
127
128         {
129                 unique_lock<mutex> lock(queue_lock);
130                 q->push_back(move(qf));
131         }
132         queues_not_empty.notify_one();  // might be spurious
133 }
134
135 void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
136 {
137         FILE *fp = fopen(filename, "wb");
138         if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
139                 printf("short write!\n");
140         }
141         fclose(fp);
142 }
143
144 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
145 {
146         fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
147 }
148
149 void BMUSBCapture::dequeue_thread_func()
150 {
151         if (has_dequeue_callbacks) {
152                 dequeue_init_callback();
153         }
154         while (!dequeue_thread_should_quit) {
155                 unique_lock<mutex> lock(queue_lock);
156                 queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
157
158                 uint16_t video_timecode = pending_video_frames.front().timecode;
159                 uint16_t audio_timecode = pending_audio_frames.front().timecode;
160                 if (video_timecode < audio_timecode) {
161                         printf("Video block 0x%04x without corresponding audio block, dropping.\n",
162                                 video_timecode);
163                         video_frame_allocator->release_frame(pending_video_frames.front().frame);
164                         pending_video_frames.pop_front();
165                 } else if (audio_timecode < video_timecode) {
166                         printf("Audio block 0x%04x without corresponding video block, dropping.\n",
167                                 audio_timecode);
168                         audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
169                         pending_audio_frames.pop_front();
170                 } else {
171                         QueuedFrame video_frame = pending_video_frames.front();
172                         QueuedFrame audio_frame = pending_audio_frames.front();
173                         pending_audio_frames.pop_front();
174                         pending_video_frames.pop_front();
175                         lock.unlock();
176
177 #if 0
178                         char filename[255];
179                         snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
180                         dump_frame(filename, video_frame.frame.data, video_frame.data_len);
181                         dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
182 #endif
183
184                         frame_callback(video_timecode,
185                                        video_frame.frame, HEADER_SIZE, video_frame.format,
186                                        audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
187                 }
188         }
189         if (has_dequeue_callbacks) {
190                 dequeue_cleanup_callback();
191         }
192 }
193
194 void BMUSBCapture::start_new_frame(const uint8_t *start)
195 {
196         uint16_t format = (start[3] << 8) | start[2];
197         uint16_t timecode = (start[1] << 8) | start[0];
198
199         if (current_video_frame.len > 0) {
200                 //dump_frame();
201                 queue_frame(format, timecode, current_video_frame, &pending_video_frames);
202         }
203         //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
204         //      format, timecode,
205         //      //start[7], start[6], start[5], start[4],
206         //      read_current_frame, FRAME_SIZE);
207
208         current_video_frame = video_frame_allocator->alloc_frame();
209         //if (current_video_frame.data == nullptr) {
210         //      read_current_frame = -1;
211         //} else {
212         //      read_current_frame = 0;
213         //}
214 }
215
216 void BMUSBCapture::start_new_audio_block(const uint8_t *start)
217 {
218         uint16_t format = (start[3] << 8) | start[2];
219         uint16_t timecode = (start[1] << 8) | start[0];
220         if (current_audio_frame.len > 0) {
221                 //dump_audio_block();
222                 queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
223         }
224         //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
225         //      format, timecode, read_current_audio_block);
226         current_audio_frame = audio_frame_allocator->alloc_frame();
227 }
228
#if 0
// Debugging aid (compiled out): hex-dumps the bytes actually received for one
// isochronous packet, 16 bytes per output line with a wider gap after the
// eighth byte. <offset> is the packet's byte offset into xfr->buffer.
static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
{
        //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
        const uint8_t *bytes = xfr->buffer + offset;
        for (unsigned pos = 0; pos < pack->actual_length; ++pos) {
                printf("%02x", bytes[pos]);

                const unsigned column = pos % 16;
                if (column == 15) {
                        printf("\n");
                } else if (column == 7) {
                        printf("  ");
                } else {
                        printf(" ");
                }
        }
}
#endif
245
// De-interleaving copy: the bytes at even positions of <src> go to <dest1>,
// the bytes at odd positions go to <dest2>. <n> is the number of source bytes
// and must be even; each destination receives n/2 bytes.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
{
        assert(n % 2 == 0);

        for (size_t i = 0; i < n; i += 2) {
                dest1[i / 2] = src[i];
                dest2[i / 2] = src[i + 1];
        }
}
257
258 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
259 {
260         if (current_frame->data == nullptr ||
261             current_frame->len > current_frame->size ||
262             start == end) {
263                 return;
264         }
265
266         int bytes = end - start;
267         if (current_frame->len + bytes > current_frame->size) {
268                 printf("%d bytes overflow after last %s frame\n",
269                         int(current_frame->len + bytes - current_frame->size), frame_type_name);
270                 //dump_frame();
271         } else {
272                 if (current_frame->interleaved) {
273                         uint8_t *data = current_frame->data + current_frame->len / 2;
274                         uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
275                         if (current_frame->len % 2 == 1) {
276                                 ++data;
277                                 swap(data, data2);
278                         }
279                         if (bytes % 2 == 1) {
280                                 *data++ = *start++;
281                                 swap(data, data2);
282                                 ++current_frame->len;
283                                 --bytes;
284                         }
285                         memcpy_interleaved(data, data2, start, bytes);
286                         current_frame->len += bytes;
287                 } else {
288                         memcpy(current_frame->data + current_frame->len, start, bytes);
289                         current_frame->len += bytes;
290                 }
291         }
292 }
293
294 #ifdef __SSE4_1__
295
#if 0
// Debugging aid (compiled out): prints <name> followed by the 32 bytes of <n>
// in hex, lowest byte first, with an extra space between groups of eight.
void avx2_dump(const char *name, __m256i n)
{
        uint8_t bytes[32];
        memcpy(bytes, &n, sizeof(bytes));

        printf("%-10s:", name);
        for (int i = 0; i < 32; ++i) {
                if (i != 0 && i % 8 == 0) {
                        printf(" ");
                }
                printf(" %02x", bytes[i]);
        }
        printf("\n");
}
#endif
338
339 // Does a memcpy and memchr in one to reduce processing time.
340 // Note that the benefit is somewhat limited if your L3 cache is small,
341 // as you'll (unfortunately) spend most of the time loading the data
342 // from main memory.
343 //
344 // Complicated cases are left to the slow path; it basically stops copying
345 // up until the first instance of "sync_char" (usually a bit before, actually).
346 // This is fine, since 0x00 bytes shouldn't really show up in normal picture
347 // data, and what we really need this for is the 00 00 ff ff marker in video data.
//
// Parameters:
//   current_frame  frame to append to; <len> is advanced by the number of
//                  bytes consumed.
//   start, limit   input range [start,limit).
//   sync_char      byte value whose presence ends the fast path.
//
// Returns a pointer to the first input byte that was NOT consumed. This is
// <start> itself if nothing could be done (no buffer, frame over-full, empty
// input, or fewer than 128 bytes of input). The caller (decode_packs) carries
// on from the returned position with memmem() and add_to_frame().
348 const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
349 {
350         if (current_frame->data == nullptr ||
351             current_frame->len > current_frame->size ||
352             start == limit) {
353                 return start;
354         }
355         size_t orig_bytes = limit - start;
356         if (orig_bytes < 128) {
357                 // Don't bother.
358                 return start;
359         }
360
361         // Don't read more bytes than we can write.
362         limit = min(limit, start + (current_frame->size - current_frame->len));
363
364         // Align end to 32 bytes.
365         limit = (const uint8_t *)(intptr_t(limit) & ~31);
366
        // Clamping and rounding down may have left nothing to process.
367         if (start >= limit) {
368                 return start;
369         }
370
371         // Process [0,31] bytes, such that start gets aligned to 32 bytes.
        // These go through the regular add_to_frame(); if sync_char already
        // shows up in this prefix, only the bytes before it are copied and
        // its position is returned.
372         const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
373         if (aligned_start != start) {
374                 const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
375                 if (sync_start == nullptr) {
376                         add_to_frame(current_frame, "", start, aligned_start);
377                 } else {
378                         add_to_frame(current_frame, "", start, sync_start);
379                         return sync_start;
380                 }
381         }
382
383         // Make the length a multiple of 64.
        // (The interleaved AVX2 loop below consumes 64 input bytes per
        // iteration; the interleaved SSE loop consumes 32.)
384         if (current_frame->interleaved) {
385                 if (((limit - aligned_start) % 64) != 0) {
386                         limit -= 32;
387                 }
388                 assert(((limit - aligned_start) % 64) == 0);
389         }
390
391 #if __AVX2__
392         const __m256i needle = _mm256_set1_epi8(sync_char);
393
394         const __restrict __m256i *in = (const __m256i *)aligned_start;
395         if (current_frame->interleaved) {
                // out1 receives the bytes at even positions of each input
                // chunk, out2 the bytes at odd positions. If the frame length
                // so far is odd, the roles of the two planes are swapped.
396                 __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
397                 __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
398                 if (current_frame->len % 2 == 1) {
399                         swap(out1, out2);
400                 }
401
                // Per 128-bit lane: gathers the even-indexed bytes into the
                // low eight bytes and the odd-indexed bytes into the high eight.
402                 __m256i shuffle_cw = _mm256_set_epi8(
403                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
404                         15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
405                 while (in < (const __m256i *)limit) {
406                         // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
407                         __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
408                         __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
409
410                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
411                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
412                         __m256i found = _mm256_or_si256(found1, found2);
413
414                         data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
415                         data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
416                 
417                         data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
418                         data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
419
420                         __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
421                         __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
422
423                         _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
424                         _mm256_storeu_si256(out2, hi);
425
                        // sync_char somewhere in these 64 bytes: stop without
                        // advancing <in>, so the chunk is not counted in <len>
                        // and is left to the caller.
426                         if (!_mm256_testz_si256(found, found)) {
427                                 break;
428                         }
429
430                         in += 2;
431                         ++out1;
432                         ++out2;
433                 }
                // Count only the chunks that were fully accepted.
434                 current_frame->len += (uint8_t *)in - aligned_start;
435         } else {
436                 __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
437                 while (in < (const __m256i *)limit) {
438                         __m256i data = _mm256_load_si256(in);
439                         _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
440                         __m256i found = _mm256_cmpeq_epi8(data, needle);
                        // As above: a chunk containing sync_char is not counted.
441                         if (!_mm256_testz_si256(found, found)) {
442                                 break;
443                         }
444
445                         ++in;
446                         ++out;
447                 }
448                 current_frame->len = (uint8_t *)out - current_frame->data;
449         }
450 #else
        // Same algorithm with 128-bit SSE registers.
451         const __m128i needle = _mm_set1_epi8(sync_char);
452
453         const __m128i *in = (const __m128i *)aligned_start;
454         if (current_frame->interleaved) {
                // Same plane assignment as in the AVX2 variant.
455                 __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
456                 __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
457                 if (current_frame->len % 2 == 1) {
458                         swap(out1, out2);
459                 }
460
461                 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
462                 while (in < (const __m128i *)limit) {
                        // Split each 16-bit pair into its low (even-position)
                        // and high (odd-position) byte, then pack the halves
                        // of both registers into one output register each.
463                         __m128i data1 = _mm_load_si128(in);
464                         __m128i data2 = _mm_load_si128(in + 1);
465                         __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
466                         __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
467                         __m128i data1_hi = _mm_srli_epi16(data1, 8);
468                         __m128i data2_hi = _mm_srli_epi16(data2, 8);
469                         __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
470                         _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
471                         __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
472                         _mm_storeu_si128(out2, hi);
473                         __m128i found1 = _mm_cmpeq_epi8(data1, needle);
474                         __m128i found2 = _mm_cmpeq_epi8(data2, needle);
                        // sync_char somewhere in these 32 bytes: stop without
                        // counting the chunk.
475                         if (!_mm_testz_si128(found1, found1) ||
476                             !_mm_testz_si128(found2, found2)) {
477                                 break;
478                         }
479
480                         in += 2;
481                         ++out1;
482                         ++out2;
483                 }
484                 current_frame->len += (uint8_t *)in - aligned_start;
485         } else {
486                 __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
487                 while (in < (const __m128i *)limit) {
488                         __m128i data = _mm_load_si128(in);
489                         _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
490                         __m128i found = _mm_cmpeq_epi8(data, needle);
491                         if (!_mm_testz_si128(found, found)) {
492                                 break;
493                         }
494
495                         ++in;
496                         ++out;
497                 }
498                 current_frame->len = (uint8_t *)out - current_frame->data;
499         }
500 #endif
501
502         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
503
        // First byte that was not consumed: either <limit> or the start of
        // the chunk in which sync_char was seen.
504         return (const uint8_t *)in;
505 }
506 #endif
507
508 void decode_packs(const libusb_transfer *xfr,
509                   const char *sync_pattern,
510                   int sync_length,
511                   FrameAllocator::Frame *current_frame,
512                   const char *frame_type_name,
513                   function<void(const uint8_t *start)> start_callback)
514 {
515         int offset = 0;
516         for (int i = 0; i < xfr->num_iso_packets; i++) {
517                 const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
518
519                 if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
520                         fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
521                         continue;
522 //exit(5);
523                 }
524
525                 const uint8_t *start = xfr->buffer + offset;
526                 const uint8_t *limit = start + pack->actual_length;
527                 while (start < limit) {  // Usually runs only one iteration.
528 #ifdef __SSE4_1__
529                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
530                         if (start == limit) break;
531                         assert(start < limit);
532 #endif
533
534                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
535                         if (start_next_frame == nullptr) {
536                                 // add the rest of the buffer
537                                 add_to_frame(current_frame, frame_type_name, start, limit);
538                                 break;
539                         } else {
540                                 add_to_frame(current_frame, frame_type_name, start, start_next_frame);
541                                 start = start_next_frame + sync_length;  // skip sync
542                                 start_callback(start);
543                         }
544                 }
545 #if 0
546                 dump_pack(xfr, offset, pack);
547 #endif
548                 offset += pack->length;
549         }
550 }
551
552 void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
553 {
554         if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
555                 fprintf(stderr, "transfer status %d\n", xfr->status);
556                 libusb_free_transfer(xfr);
557                 exit(3);
558         }
559
560         assert(xfr->user_data != nullptr);
561         BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
562
563         if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
564                 if (xfr->endpoint == 0x84) {
565                         decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
566                 } else {
567                         decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
568                 }
569         }
570         if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
571                 //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
572                 uint8_t *buf = libusb_control_transfer_get_data(xfr);
573 #if 0
574                 if (setup->wIndex == 44) {
575                         printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
576                 } else {
577                         printf("read register %2d:                      0x%02x%02x%02x%02x\n",
578                                 setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
579                 }
580 #else
581                 memcpy(usb->register_file + usb->current_register, buf, 4);
582                 usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
583                 if (usb->current_register == 0) {
584                         // read through all of them
585                         printf("register dump:");
586                         for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
587                                 printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
588                         }
589                         printf("\n");
590                 }
591                 libusb_fill_control_setup(xfr->buffer,
592                     LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
593                         /*index=*/usb->current_register, /*length=*/4);
594 #endif
595         }
596
597 #if 0
598         printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
599         for (i = 0; i < xfr->actual_length; i++) {
600                 printf("%02x", xfr->buffer[i]);
601                 if (i % 16)
602                         printf("\n");
603                 else if (i % 8)
604                         printf("  ");
605                 else
606                         printf(" ");
607         }
608 #endif
609
610         if (libusb_submit_transfer(xfr) < 0) {
611                 fprintf(stderr, "error re-submitting URB\n");
612                 exit(1);
613         }
614 }
615
616 void BMUSBCapture::usb_thread_func()
617 {
618         sched_param param;
619         memset(&param, 0, sizeof(param));
620         param.sched_priority = 1;
621         if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
622                 printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
623         }
624         while (!should_quit) {
625                 int rc = libusb_handle_events(nullptr);
626                 if (rc != LIBUSB_SUCCESS)
627                         break;
628         }
629 }
630
631 void BMUSBCapture::configure_card()
632 {
633         if (video_frame_allocator == nullptr) {
634                 set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
635         }
636         if (audio_frame_allocator == nullptr) {
637                 set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
638         }
639         dequeue_thread_should_quit = false;
640         dequeue_thread = thread(&BMUSBCapture::dequeue_thread_func, this);
641
642         int rc;
643         struct libusb_transfer *xfr;
644
645         rc = libusb_init(nullptr);
646         if (rc < 0) {
647                 fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
648                 exit(1);
649         }
650
651         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
652         //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
653         struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
654         if (!devh) {
655                 fprintf(stderr, "Error finding USB device\n");
656                 exit(1);
657         }
658
659         libusb_config_descriptor *config;
660         rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
661         if (rc < 0) {
662                 fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
663                 exit(1);
664         }
665         printf("%d interface\n", config->bNumInterfaces);
666         for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
667                 printf("  interface %d\n", interface_number);
668                 const libusb_interface *interface = &config->interface[interface_number];
669                 for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
670                         const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
671                         printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
672                         for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
673                                 const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
674                                 printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
675                         }
676                 }
677         }
678
679         rc = libusb_set_configuration(devh, /*configuration=*/1);
680         if (rc < 0) {
681                 fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
682                 exit(1);
683         }
684
685         rc = libusb_claim_interface(devh, 0);
686         if (rc < 0) {
687                 fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
688                 exit(1);
689         }
690
691         // Alternate setting 1 is output, alternate setting 2 is input.
692         // Card is reset when switching alternates, so the driver uses
693         // this “double switch” when it wants to reset.
694         //
695         // There's also alternate settings 3 and 4, which seem to be
696         // like 1 and 2 except they advertise less bandwidth needed.
697         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
698         if (rc < 0) {
699                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
700                 exit(1);
701         }
702         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/2);
703         if (rc < 0) {
704                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
705                 exit(1);
706         }
707 #if 0
708         rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
709         if (rc < 0) {
710                 fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
711                 exit(1);
712         }
713 #endif
714
715 #if 0
716         rc = libusb_claim_interface(devh, 3);
717         if (rc < 0) {
718                 fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
719                 exit(1);
720         }
721 #endif
722
723         // theories:
724         //   44 is some kind of timer register (first 16 bits count upwards)
725         //   24 is some sort of watchdog?
726         //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
727         //      (or will go to 0x73c60010?), also seen 0x73c60100
728         //   12 also changes all the time, unclear why  
729         //   16 seems to be autodetected mode somehow
730         //      --    this is e00115e0 after reset?
731         //                    ed0115e0 after mode change [to output?]
732         //                    2d0015e0 after more mode change [to input]
733         //                    ed0115e0 after more mode change
734         //                    2d0015e0 after more mode change
735         //
736         //                    390115e0 seems to indicate we have signal
737         //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
738         //
739         //                    200015e0 on startup
740         //         changes to 250115e0 when we sync to the signal
741         //
742         //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
743         //
744         //    Bottom 16 bits of this register seem to be firmware version number (possibly not all of them).
745         //
746         //    28 and 32 seem to be analog audio input levels (one byte for each of the eight channels).
747         //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
748         //
749         //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
750         //    perhaps some of them are related to analog output?
751         //
752         //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
753         //    but the driver sets it to 0x8036802a at some point.
754         //
755         //    all of this is on request 214/215. other requests (192, 219,
756         //    222, 223, 224) are used for firmware upgrade. Probably best to
757         //    stay out of it unless you know what you're doing.
758         //
759         //
760         // register 16:
761         // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
762         //
763         // theories:
764         //   0x01 - stable signal
765         //   0x04 - deep color
766         //   0x08 - unknown (audio??)
767         //   0x20 - 720p??
768         //   0x30 - 576p??
769
770         struct ctrl {
771                 int endpoint;
772                 int request;
773                 int index;
774                 uint32_t data;
775         };
776         static const ctrl ctrls[] = {
777                 { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
778                 { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
779
780                 // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
781                 // capture (v210).
782                 // clearing the 0x08000000 bit seems to change the capture format (other source?)
783                 // 0x10000000 = analog audio instead of embedded audio, it seems
784                 // 0x3a000000 = component video? (analog audio)
785                 // 0x3c000000 = composite video? (analog audio)
786                 // 0x3e000000 = s-video? (analog audio)
787                 { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
788                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
789                 //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
790                 { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
791                 { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
792         };
793
794         for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
795                 uint32_t flipped = htonl(ctrls[req].data);
796                 static uint8_t value[4];
797                 memcpy(value, &flipped, sizeof(flipped));
798                 int size = sizeof(value);
799                 //if (ctrls[req].request == 215) size = 0;
800                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
801                         /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
802                 if (rc < 0) {
803                         fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
804                         exit(1);
805                 }
806                 
807                 printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
808                 for (int i = 0; i < rc; ++i) {
809                         printf("%02x", value[i]);
810                 }
811                 printf("\n");
812         }
813
814 #if 0
815         // DEBUG
816         for ( ;; ) {
817                 static int my_index = 0;
818                 static uint8_t value[4];
819                 int size = sizeof(value);
820                 rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
821                         /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
822                 if (rc < 0) {
823                         fprintf(stderr, "Error on control\n");
824                         exit(1);
825                 }
826                 printf("rc=%d index=%d: 0x", rc, my_index);
827                 for (int i = 0; i < rc; ++i) {
828                         printf("%02x", value[i]);
829                 }
830                 printf("\n");
831         }
832 #endif
833
834 #if 0
835         // set up an asynchronous transfer of the timer register
836         static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
837         static int completed = 0;
838
839         xfr = libusb_alloc_transfer(0);
840         libusb_fill_control_setup(cmdbuf,
841             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
842                 /*index=*/44, /*length=*/4);
843         libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
844         xfr->user_data = this;
845         libusb_submit_transfer(xfr);
846
847         // set up an asynchronous transfer of register 24
848         static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
849         static int completed2 = 0;
850
851         xfr = libusb_alloc_transfer(0);
852         libusb_fill_control_setup(cmdbuf2,
853             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
854                 /*index=*/24, /*length=*/4);
855         libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
856         xfr->user_data = this;
857         libusb_submit_transfer(xfr);
858 #endif
859
860         // set up an asynchronous transfer of the register dump
861         static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
862         static int completed3 = 0;
863
864         xfr = libusb_alloc_transfer(0);
865         libusb_fill_control_setup(cmdbuf3,
866             LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
867                 /*index=*/current_register, /*length=*/4);
868         libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
869         xfr->user_data = this;
870         //libusb_submit_transfer(xfr);
871
872         audiofp = fopen("audio.raw", "wb");
873
874         // set up isochronous transfers for audio and video
875         for (int e = 3; e <= 4; ++e) {
876                 //int num_transfers = (e == 3) ? 6 : 6;
877                 int num_transfers = 6;
878                 for (int i = 0; i < num_transfers; ++i) {
879                         int num_iso_pack, size;
880                         if (e == 3) {
881                                 // Video seems to require isochronous packets scaled with the width; 
882                                 // seemingly six lines is about right, rounded up to the required 1kB
883                                 // multiple.
884                                 size = WIDTH * 2 * 6;
885                                 // Note that for 10-bit input, you'll need to increase size accordingly.
886                                 //size = size * 4 / 3;
887                                 if (size % 1024 != 0) {
888                                         size &= ~1023;
889                                         size += 1024;
890                                 }
891                                 num_iso_pack = (2 << 18) / size;  // 512 kB.
892                                 printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
893                         } else {
894                                 size = 0xc0;
895                                 num_iso_pack = 80;
896                         }
897                         int num_bytes = num_iso_pack * size;
898                         uint8_t *buf = new uint8_t[num_bytes];
899
900                         xfr = libusb_alloc_transfer(num_iso_pack);
901                         if (!xfr) {
902                                 fprintf(stderr, "oom\n");
903                                 exit(1);
904                         }
905
906                         int ep = LIBUSB_ENDPOINT_IN | e;
907                         libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
908                                 num_iso_pack, cb_xfr, nullptr, 0);
909                         libusb_set_iso_packet_lengths(xfr, size);
910                         xfr->user_data = this;
911                         iso_xfrs.push_back(xfr);
912                 }
913         }
914 }
915
916 void BMUSBCapture::start_bm_capture()
917 {
918         printf("starting capture\n");
919         int i = 0;
920         for (libusb_transfer *xfr : iso_xfrs) {
921                 printf("submitting transfer...\n");
922                 int rc = libusb_submit_transfer(xfr);
923                 ++i;
924                 if (rc < 0) {
925                         //printf("num_bytes=%d\n", num_bytes);
926                         fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
927                                 xfr->endpoint, i, libusb_error_name(rc));
928                         exit(1);
929                 }
930         }
931
932
933 #if 0
934         libusb_release_interface(devh, 0);
935 out:
936         if (devh)
937                 libusb_close(devh);
938         libusb_exit(nullptr);
939         return rc;
940 #endif
941 }
942
943 void BMUSBCapture::stop_dequeue_thread()
944 {
945         dequeue_thread_should_quit = true;
946         queues_not_empty.notify_all();  
947         dequeue_thread.join();
948 }
949
950 void BMUSBCapture::start_bm_thread()
951 {
952         should_quit = false;
953         usb_thread = thread(&BMUSBCapture::usb_thread_func);
954 }
955
956 void BMUSBCapture::stop_bm_thread()
957 {
958         should_quit = true;
959         usb_thread.join();
960 }