#undef Success

#include "mixer.h"

#include <assert.h>
#include <epoxy/egl.h>
#include <errno.h>
#include <movit/effect_chain.h>
#include <movit/effect_util.h>
#include <movit/flat_input.h>
#include <movit/image_format.h>
#include <movit/init.h>
#include <movit/resource_pool.h>
#include <movit/util.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <cmath>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "bmusb/bmusb.h"
#include "context.h"
#include "defs.h"
#include "flags.h"
#include "h264encode.h"
#include "pbo_frame_allocator.h"
#include "ref_counted_gl_sync.h"
#include "timebase.h"

class QOpenGLContext;

using namespace movit;
using namespace std;
using namespace std::placeholders;

Mixer *global_mixer = nullptr;

namespace {

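// Convert packed little-endian signed 24-bit PCM to 32-bit float in [-1, 1),
// keeping only the first out_channels of the in_channels channels present
// in the stream.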
void convert_fixed24_to_fp32(float *dst, size_t out_channels, const uint8_t *src, size_t in_channels, size_t num_samples)
{
        for (size_t i = 0; i < num_samples; ++i) {
                for (size_t j = 0; j < out_channels; ++j) {
                        uint32_t s1 = *src++;
                        uint32_t s2 = *src++;
                        uint32_t s3 = *src++;
                        uint32_t s = s1 | (s1 << 8) | (s2 << 16) | (s3 << 24);
                        dst[i * out_channels + j] = int(s) * (1.0f / 4294967296.0f);
                }
                src += 3 * (in_channels - out_channels);
        }
}

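// Maintain the per-card frame history in the input state (used by the theme,
// e.g. for deinterlacing): an interlaced field shifts the history one step,
// while a progressive frame replaces the entire history.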
void insert_new_frame(RefCountedFrame frame, unsigned field_num, bool interlaced, unsigned card_index, InputState *input_state)
{
        if (interlaced) {
                for (unsigned frame_num = FRAME_HISTORY_LENGTH; frame_num --> 1; ) {  // :-)
                        input_state->buffered_frames[card_index][frame_num] =
                                input_state->buffered_frames[card_index][frame_num - 1];
                }
                input_state->buffered_frames[card_index][0] = { frame, field_num };
        } else {
                for (unsigned frame_num = 0; frame_num < FRAME_HISTORY_LENGTH; ++frame_num) {
                        input_state->buffered_frames[card_index][frame_num] = { frame, field_num };
                }
        }
}

string generate_local_dump_filename(int frame)
{
        time_t now = time(NULL);
        tm now_tm;
        localtime_r(&now, &now_tm);

        char timestamp[256];
        strftime(timestamp, sizeof(timestamp), "%F-%T%z", &now_tm);

        // Use the frame number to disambiguate between two cuts starting
        // on the same second.
        char filename[256];
        snprintf(filename, sizeof(filename), "%s%s-f%02d%s",
                LOCAL_DUMP_PREFIX, timestamp, frame % 100, LOCAL_DUMP_SUFFIX);
        return filename;
}

}  // namespace

Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
        : httpd(WIDTH, HEIGHT),
          num_cards(num_cards),
          mixer_surface(create_surface(format)),
          h264_encoder_surface(create_surface(format)),
          correlation(OUTPUT_FREQUENCY),
          level_compressor(OUTPUT_FREQUENCY),
          limiter(OUTPUT_FREQUENCY),
          compressor(OUTPUT_FREQUENCY)
{
        httpd.open_output_file(generate_local_dump_filename(/*frame=*/0).c_str());
        httpd.start(9095);

        CHECK(init_movit(MOVIT_SHADER_DIR, MOVIT_DEBUG_OFF));
        check_error();

        // Since we allow non-bouncing 4:2:2 YCbCrInputs, effective subpixel precision
        // will be halved when sampling them, and we need to compensate here.
        movit_texel_subpixel_precision /= 2.0;

        resource_pool.reset(new ResourcePool);
        theme.reset(new Theme("theme.lua", resource_pool.get(), num_cards));
        for (unsigned i = 0; i < NUM_OUTPUTS; ++i) {
                output_channel[i].parent = this;
        }

        ImageFormat inout_format;
        inout_format.color_space = COLORSPACE_sRGB;
        inout_format.gamma_curve = GAMMA_sRGB;

        // Display chain; shows the live output produced by the main chain (its RGBA version).
        display_chain.reset(new EffectChain(WIDTH, HEIGHT, resource_pool.get()));
        check_error();
        display_input = new FlatInput(inout_format, FORMAT_RGB, GL_UNSIGNED_BYTE, WIDTH, HEIGHT);  // FIXME: GL_UNSIGNED_BYTE is really wrong.
        display_chain->add_input(display_input);
        display_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
        display_chain->set_dither_bits(0);  // Don't bother.
        display_chain->finalize();

        h264_encoder.reset(new H264Encoder(h264_encoder_surface, global_flags.va_display, WIDTH, HEIGHT, &httpd));

        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                configure_card(card_index, format, new BMUSBCapture(card_index));
        }

        BMUSBCapture::start_bm_thread();

        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                cards[card_index].capture->start_bm_capture();
        }

        // Set up stuff for NV12 conversion.
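        // The main chain renders Y and full-resolution interleaved Cb/Cr; the
        // shader below is used by subsample_chroma() to downsample the Cb/Cr
        // texture to half resolution in both directions, producing the NV12
        // chroma plane that the H.264 encoder expects.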

        // Cb/Cr shader.
        string cbcr_vert_shader =
                "#version 130 \n"
                " \n"
                "in vec2 position; \n"
                "in vec2 texcoord; \n"
                "out vec2 tc0; \n"
                "uniform vec2 foo_chroma_offset_0; \n"
                " \n"
                "void main() \n"
                "{ \n"
                "    // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is: \n"
                "    // \n"
                "    //   2.000  0.000  0.000 -1.000 \n"
                "    //   0.000  2.000  0.000 -1.000 \n"
                "    //   0.000  0.000 -2.000 -1.000 \n"
                "    //   0.000  0.000  0.000  1.000 \n"
                "    gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0); \n"
                "    vec2 flipped_tc = texcoord; \n"
                "    tc0 = flipped_tc + foo_chroma_offset_0; \n"
                "} \n";
        string cbcr_frag_shader =
                "#version 130 \n"
                "in vec2 tc0; \n"
                "uniform sampler2D cbcr_tex; \n"
                "out vec4 FragColor; \n"
                "void main() { \n"
                "    FragColor = texture(cbcr_tex, tc0); \n"
                "} \n";
        vector<string> frag_shader_outputs;
        cbcr_program_num = resource_pool->compile_glsl_program(cbcr_vert_shader, cbcr_frag_shader, frag_shader_outputs);

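        // A single oversized triangle covering the entire (0,0)-(1,1) output
        // area; everything outside the viewport is clipped away. Slightly
        // cheaper than drawing a two-triangle quad.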
        float vertices[] = {
                0.0f, 2.0f,
                0.0f, 0.0f,
                2.0f, 0.0f
        };
        cbcr_vbo = generate_vbo(2, GL_FLOAT, sizeof(vertices), vertices);
        cbcr_position_attribute_index = glGetAttribLocation(cbcr_program_num, "position");
        cbcr_texcoord_attribute_index = glGetAttribLocation(cbcr_program_num, "texcoord");

        r128.init(2, OUTPUT_FREQUENCY);
        r128.integr_start();

        locut.init(FILTER_HPF, 2);

        // hlen=16 is pretty low quality, but we use quite a bit of CPU otherwise,
        // and there's a limit to how important the peak meter is.
        peak_resampler.setup(OUTPUT_FREQUENCY, OUTPUT_FREQUENCY * 4, /*num_channels=*/2, /*hlen=*/16, /*frel=*/1.0);

        alsa.reset(new ALSAOutput(OUTPUT_FREQUENCY, /*num_channels=*/2));
}

Mixer::~Mixer()
{
        resource_pool->release_glsl_program(cbcr_program_num);
        glDeleteBuffers(1, &cbcr_vbo);
        BMUSBCapture::stop_bm_thread();

        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                {
                        unique_lock<mutex> lock(bmusb_mutex);
                        cards[card_index].should_quit = true;  // Unblock thread.
                        cards[card_index].new_data_ready_changed.notify_all();
                }
                cards[card_index].capture->stop_dequeue_thread();
        }

        h264_encoder.reset(nullptr);
}

void Mixer::configure_card(unsigned card_index, const QSurfaceFormat &format, CaptureInterface *capture)
{
        printf("Configuring card %d...\n", card_index);

        CaptureCard *card = &cards[card_index];
        card->capture = capture;
        card->capture->set_frame_callback(bind(&Mixer::bm_frame, this, card_index, _1, _2, _3, _4, _5, _6, _7));
        card->frame_allocator.reset(new PBOFrameAllocator(8 << 20, WIDTH, HEIGHT));  // 8 MB.
        card->capture->set_video_frame_allocator(card->frame_allocator.get());
        card->surface = create_surface(format);
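        // The frame callbacks run on the capture card's dequeue thread, so give
        // that thread its own GL context; texture uploads can then happen there
        // without involving the mixer thread's context.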
        card->capture->set_dequeue_thread_callbacks(
                [card]{
                        eglBindAPI(EGL_OPENGL_API);
                        card->context = create_context(card->surface);
                        if (!make_current(card->context, card->surface)) {
                                printf("failed to create bmusb context\n");
                                exit(1);
                        }
                },
                [this]{
                        resource_pool->clean_context();
                });
        card->resampling_queue.reset(new ResamplingQueue(OUTPUT_FREQUENCY, OUTPUT_FREQUENCY, 2));
        card->capture->configure_card();
}


namespace {

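// The capture card's timecode is only 16 bits wide and increments by one per
// frame; extend it against the last seen (already unwrapped) value so that it
// keeps increasing monotonically across 16-bit wraparound.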
int unwrap_timecode(uint16_t current_wrapped, int last)
{
        uint16_t last_wrapped = last & 0xffff;
        if (current_wrapped > last_wrapped) {
                return (last & ~0xffff) | current_wrapped;
        } else {
                return 0x10000 + ((last & ~0xffff) | current_wrapped);
        }
}

float find_peak(const float *samples, size_t num_samples)
{
        float m = fabs(samples[0]);
        for (size_t i = 1; i < num_samples; ++i) {
                m = max(m, fabs(samples[i]));
        }
        return m;
}

void deinterleave_samples(const vector<float> &in, vector<float> *out_l, vector<float> *out_r)
{
        size_t num_samples = in.size() / 2;
        out_l->resize(num_samples);
        out_r->resize(num_samples);

        const float *inptr = in.data();
        float *lptr = &(*out_l)[0];
        float *rptr = &(*out_r)[0];
        for (size_t i = 0; i < num_samples; ++i) {
                *lptr++ = *inptr++;
                *rptr++ = *inptr++;
        }
}

}  // namespace

void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
                     FrameAllocator::Frame video_frame, size_t video_offset, VideoFormat video_format,
                     FrameAllocator::Frame audio_frame, size_t audio_offset, uint16_t audio_format)
{
        CaptureCard *card = &cards[card_index];

        int64_t frame_length = int64_t(TIMEBASE * video_format.frame_rate_den) / video_format.frame_rate_nom;

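        // The audio comes as eight channels of packed 24-bit samples (hence the
        // division by 8 * 3 bytes). More than 100 ms worth of audio for a single
        // frame is implausible, so treat that as a broken capture and drop it.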
        size_t num_samples = (audio_frame.len >= audio_offset) ? (audio_frame.len - audio_offset) / 8 / 3 : 0;
        if (num_samples > OUTPUT_FREQUENCY / 10) {
                printf("Card %d: Dropping frame with implausible audio length (len=%d, offset=%d) [timecode=0x%04x video_len=%d video_offset=%d video_format=%x]\n",
                        card_index, int(audio_frame.len), int(audio_offset),
                        timecode, int(video_frame.len), int(video_offset), video_format.id);
                if (video_frame.owner) {
                        video_frame.owner->release_frame(video_frame);
                }
                if (audio_frame.owner) {
                        audio_frame.owner->release_frame(audio_frame);
                }
                return;
        }

        int64_t local_pts = card->next_local_pts;
        int dropped_frames = 0;
        if (card->last_timecode != -1) {
                dropped_frames = unwrap_timecode(timecode, card->last_timecode) - card->last_timecode - 1;
        }

        // Convert the audio to stereo fp32 and add it.
        vector<float> audio;
        audio.resize(num_samples * 2);
        convert_fixed24_to_fp32(&audio[0], 2, audio_frame.data + audio_offset, 8, num_samples);

        // Add the audio.
        {
                unique_lock<mutex> lock(card->audio_mutex);

                // Number of samples per frame if we need to insert silence.
                // (Could be nonintegral, but resampling will save us then.)
                int silence_samples = OUTPUT_FREQUENCY * video_format.frame_rate_den / video_format.frame_rate_nom;

                if (dropped_frames > MAX_FPS * 2) {
                        fprintf(stderr, "Card %d lost more than two seconds (or time code jumping around; from 0x%04x to 0x%04x), resetting resampler\n",
                                card_index, card->last_timecode, timecode);
                        card->resampling_queue.reset(new ResamplingQueue(OUTPUT_FREQUENCY, OUTPUT_FREQUENCY, 2));
                        dropped_frames = 0;
                } else if (dropped_frames > 0) {
                        // Insert silence as needed.
                        fprintf(stderr, "Card %d dropped %d frame(s) (before timecode 0x%04x), inserting silence.\n",
                                card_index, dropped_frames, timecode);
                        vector<float> silence(silence_samples * 2, 0.0f);
                        for (int i = 0; i < dropped_frames; ++i) {
                                card->resampling_queue->add_input_samples(local_pts / double(TIMEBASE), silence.data(), silence_samples);
                                // Note that if the format changed in the meantime, we have
                                // no way of detecting that; we just have to assume the frame length
                                // is always the same.
                                local_pts += frame_length;
                        }
                }
                if (num_samples == 0) {
                        audio.resize(silence_samples * 2);
                        num_samples = silence_samples;
                }
                card->resampling_queue->add_input_samples(local_pts / double(TIMEBASE), audio.data(), num_samples);
                card->next_local_pts = local_pts + frame_length;
        }

        card->last_timecode = timecode;

        // Done with the audio, so release it.
        if (audio_frame.owner) {
                audio_frame.owner->release_frame(audio_frame);
        }

        {
                // Wait until the previous frame was consumed.
                unique_lock<mutex> lock(bmusb_mutex);
                card->new_data_ready_changed.wait(lock, [card]{ return !card->new_data_ready || card->should_quit; });
                if (card->should_quit) return;
        }

        size_t expected_length = video_format.width * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom) * 2;
        if (video_frame.len - video_offset == 0 ||
            video_frame.len - video_offset != expected_length) {
                if (video_frame.len != 0) {
                        printf("Card %d: Dropping video frame with wrong length (%ld; expected %ld)\n",
                                card_index, video_frame.len - video_offset, expected_length);
                }
                if (video_frame.owner) {
                        video_frame.owner->release_frame(video_frame);
                }

                // Still send on the information that we _had_ a frame, even though it's corrupted,
                // so that pts can go up accordingly.
                {
                        unique_lock<mutex> lock(bmusb_mutex);
                        card->new_data_ready = true;
                        card->new_frame = RefCountedFrame(FrameAllocator::Frame());
                        card->new_frame_length = frame_length;
                        card->new_frame_interlaced = false;
                        card->new_data_ready_fence = nullptr;
                        card->dropped_frames = dropped_frames;
                        card->new_data_ready_changed.notify_all();
                }
                return;
        }

        PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)video_frame.userdata;

        unsigned num_fields = video_format.interlaced ? 2 : 1;
        timespec frame_upload_start;
        if (video_format.interlaced) {
                // Send the two fields along as separate frames; the other side will need to add
                // a deinterlacer to actually get this right.
                assert(video_format.height % 2 == 0);
                video_format.height /= 2;
                assert(frame_length % 2 == 0);
                frame_length /= 2;
                num_fields = 2;
                clock_gettime(CLOCK_MONOTONIC, &frame_upload_start);
        }
        userdata->last_interlaced = video_format.interlaced;
        userdata->last_frame_rate_nom = video_format.frame_rate_nom;
        userdata->last_frame_rate_den = video_format.frame_rate_den;
        RefCountedFrame new_frame(video_frame);

        // Upload the textures.
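        // The buffer from the frame allocator holds the interleaved Cb/Cr data
        // in its first half and the Y data in its second half, hence the offset
        // computations below.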
        size_t cbcr_width = video_format.width / 2;
        size_t cbcr_offset = video_offset / 2;
        size_t y_offset = video_frame.size / 2 + video_offset / 2;

        for (unsigned field = 0; field < num_fields; ++field) {
                unsigned field_start_line = (field == 1) ? video_format.second_field_start : video_format.extra_lines_top + field * (video_format.height + 22);

                if (userdata->tex_y[field] == 0 ||
                    userdata->tex_cbcr[field] == 0 ||
                    video_format.width != userdata->last_width[field] ||
                    video_format.height != userdata->last_height[field]) {
                        // We changed resolution since the last use of this texture, so we need to create
                        // a new object. Note that since each card has its own PBOFrameAllocator,
                        // we don't need to worry about these flip-flopping between resolutions.
                        glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
                        check_error();
                        glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, video_format.height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
                        check_error();
                        glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
                        check_error();
                        glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, video_format.width, video_format.height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
                        check_error();
                        userdata->last_width[field] = video_format.width;
                        userdata->last_height[field] = video_format.height;
                }

                GLuint pbo = userdata->pbo;
                check_error();
                glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
                check_error();
                glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
                check_error();

                glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
                check_error();
                glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, cbcr_width, video_format.height, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t)));
                check_error();
                glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
                check_error();
                glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, video_format.width, video_format.height, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(y_offset + video_format.width * field_start_line));
                check_error();
                glBindTexture(GL_TEXTURE_2D, 0);
                check_error();
                glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
                check_error();
                GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
                check_error();
                assert(fence != nullptr);

                if (field == 1) {
                        // Don't upload the second field as fast as we can; wait until
                        // the field time has approximately passed. (Otherwise, we could
                        // get timing jitter against the other sources, and possibly also
                        // against the video display, although the latter is not as critical.)
                        // This requires our system clock to be reasonably close to the
                        // video clock, but that's not an unreasonable assumption.
                        timespec second_field_start;
                        second_field_start.tv_nsec = frame_upload_start.tv_nsec +
                                frame_length * 1000000000 / TIMEBASE;
                        second_field_start.tv_sec = frame_upload_start.tv_sec +
                                second_field_start.tv_nsec / 1000000000;
                        second_field_start.tv_nsec %= 1000000000;

                        while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                               &second_field_start, nullptr) == -1 &&
                               errno == EINTR) ;
                }

                {
                        unique_lock<mutex> lock(bmusb_mutex);
                        card->new_data_ready = true;
                        card->new_frame = new_frame;
                        card->new_frame_length = frame_length;
                        card->new_frame_field = field;
                        card->new_frame_interlaced = video_format.interlaced;
                        card->new_data_ready_fence = fence;
                        card->dropped_frames = dropped_frames;
                        card->new_data_ready_changed.notify_all();

                        if (field != num_fields - 1) {
                                // Wait until the previous frame was consumed.
                                card->new_data_ready_changed.wait(lock, [card]{ return !card->new_data_ready || card->should_quit; });
                                if (card->should_quit) return;
                        }
                }
        }
}

void Mixer::thread_func()
{
        eglBindAPI(EGL_OPENGL_API);
        QOpenGLContext *context = create_context(mixer_surface);
        if (!make_current(context, mixer_surface)) {
                printf("oops\n");
                exit(1);
        }

        struct timespec start, now;
        clock_gettime(CLOCK_MONOTONIC, &start);

        int frame = 0;
        int stats_dropped_frames = 0;

        while (!should_quit) {
                CaptureCard card_copy[MAX_CARDS];
                int num_samples[MAX_CARDS];

                {
                        unique_lock<mutex> lock(bmusb_mutex);

                        // The first card is the master timer, so wait for it to have a new frame.
                        // TODO: Make configurable, and with a timeout.
                        cards[0].new_data_ready_changed.wait(lock, [this]{ return cards[0].new_data_ready; });

                        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                                CaptureCard *card = &cards[card_index];
                                card_copy[card_index].new_data_ready = card->new_data_ready;
                                card_copy[card_index].new_frame = card->new_frame;
                                card_copy[card_index].new_frame_length = card->new_frame_length;
                                card_copy[card_index].new_frame_field = card->new_frame_field;
                                card_copy[card_index].new_frame_interlaced = card->new_frame_interlaced;
                                card_copy[card_index].new_data_ready_fence = card->new_data_ready_fence;
                                card_copy[card_index].dropped_frames = card->dropped_frames;
                                card->new_data_ready = false;
                                card->new_data_ready_changed.notify_all();
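                                // Compute how many audio samples this frame should
                                // contribute, carrying the fractional remainder over
                                // to the next frame (Bresenham-style) so that no
                                // samples are lost over time.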
                                int num_samples_times_timebase = OUTPUT_FREQUENCY * card->new_frame_length + card->fractional_samples;
                                num_samples[card_index] = num_samples_times_timebase / TIMEBASE;
                                card->fractional_samples = num_samples_times_timebase % TIMEBASE;
                                assert(num_samples[card_index] >= 0);
                        }
                }

                // Resample the audio as needed, including from previously dropped frames.
                assert(num_cards > 0);
                for (unsigned frame_num = 0; frame_num < card_copy[0].dropped_frames + 1; ++frame_num) {
                        {
                                // Signal to the audio thread to process this frame.
                                unique_lock<mutex> lock(audio_mutex);
                                audio_task_queue.push(AudioTask{pts_int, num_samples[0]});
                                audio_task_queue_changed.notify_one();
                        }
                        if (frame_num != card_copy[0].dropped_frames) {
                                // For dropped frames, increase the pts. Note that if the format changed
                                // in the meantime, we have no way of detecting that; we just have to
                                // assume the frame length is always the same.
                                ++stats_dropped_frames;
                                pts_int += card_copy[0].new_frame_length;
                        }
                }

                if (audio_level_callback != nullptr) {
                        unique_lock<mutex> lock(compressor_mutex);
                        double loudness_s = r128.loudness_S();
                        double loudness_i = r128.integrated();
                        double loudness_range_low = r128.range_min();
                        double loudness_range_high = r128.range_max();

                        audio_level_callback(loudness_s, 20.0 * log10(peak),
                                             loudness_i, loudness_range_low, loudness_range_high,
                                             gain_staging_db, 20.0 * log10(final_makeup_gain),
                                             correlation.get_correlation());
                }

                for (unsigned card_index = 1; card_index < num_cards; ++card_index) {
                        if (card_copy[card_index].new_data_ready && card_copy[card_index].new_frame->len == 0) {
                                ++card_copy[card_index].dropped_frames;
                        }
                        if (card_copy[card_index].dropped_frames > 0) {
                                printf("Card %u dropped %d frames before this\n",
                                        card_index, int(card_copy[card_index].dropped_frames));
                        }
                }

                // If the first card is reporting a corrupted or otherwise dropped frame,
                // just increase the pts (skipping over this frame) and don't try to compute anything new.
                if (card_copy[0].new_frame->len == 0) {
                        ++stats_dropped_frames;
                        pts_int += card_copy[0].new_frame_length;
                        continue;
                }

                for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                        CaptureCard *card = &card_copy[card_index];
                        if (!card->new_data_ready || card->new_frame->len == 0)
                                continue;

                        assert(card->new_frame != nullptr);
                        insert_new_frame(card->new_frame, card->new_frame_field, card->new_frame_interlaced, card_index, &input_state);
                        check_error();

                        // The new texture might still be uploaded,
                        // tell the GPU to wait until it's there.
                        if (card->new_data_ready_fence) {
                                glWaitSync(card->new_data_ready_fence, /*flags=*/0, GL_TIMEOUT_IGNORED);
                                check_error();
                                glDeleteSync(card->new_data_ready_fence);
                                check_error();
                        }
                }

                // Get the main chain from the theme, and set its state immediately.
                Theme::Chain theme_main_chain = theme->get_chain(0, pts(), WIDTH, HEIGHT, input_state);
                EffectChain *chain = theme_main_chain.chain;
                theme_main_chain.setup_chain();
                //theme_main_chain.chain->enable_phase_timing(true);

                GLuint y_tex, cbcr_tex;
                bool got_frame = h264_encoder->begin_frame(&y_tex, &cbcr_tex);
                assert(got_frame);

                // Render main chain.
                GLuint cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, WIDTH, HEIGHT);
                GLuint rgba_tex = resource_pool->create_2d_texture(GL_RGB565, WIDTH, HEIGHT);  // Saves texture bandwidth, although dithering gets messed up.
                GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, rgba_tex);
                check_error();
                chain->render_to_fbo(fbo, WIDTH, HEIGHT);
                resource_pool->release_fbo(fbo);

                subsample_chroma(cbcr_full_tex, cbcr_tex);
                resource_pool->release_2d_texture(cbcr_full_tex);

                // Set the right state for rgba_tex.
                glBindFramebuffer(GL_FRAMEBUFFER, 0);
                glBindTexture(GL_TEXTURE_2D, rgba_tex);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
                glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

                RefCountedGLsync fence(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
                check_error();

                const int64_t av_delay = TIMEBASE / 10;  // Corresponds to the fixed delay in resampling_queue.h. TODO: Make less hard-coded.
                h264_encoder->end_frame(fence, pts_int + av_delay, theme_main_chain.input_frames);
                ++frame;
                pts_int += card_copy[0].new_frame_length;

                // The live frame just shows the RGBA texture we just rendered.
                // It owns rgba_tex now.
                DisplayFrame live_frame;
                live_frame.chain = display_chain.get();
                live_frame.setup_chain = [this, rgba_tex]{
                        display_input->set_texture_num(rgba_tex);
                };
                live_frame.ready_fence = fence;
                live_frame.input_frames = {};
                live_frame.temp_textures = { rgba_tex };
                output_channel[OUTPUT_LIVE].output_frame(live_frame);

                // Set up preview and any additional channels.
                for (int i = 1; i < theme->get_num_channels() + 2; ++i) {
                        DisplayFrame display_frame;
                        Theme::Chain chain = theme->get_chain(i, pts(), WIDTH, HEIGHT, input_state);  // FIXME: dimensions
                        display_frame.chain = chain.chain;
                        display_frame.setup_chain = chain.setup_chain;
                        display_frame.ready_fence = fence;
                        display_frame.input_frames = chain.input_frames;
                        display_frame.temp_textures = {};
                        output_channel[i].output_frame(display_frame);
                }

                clock_gettime(CLOCK_MONOTONIC, &now);
                double elapsed = now.tv_sec - start.tv_sec +
                        1e-9 * (now.tv_nsec - start.tv_nsec);
                if (frame % 100 == 0) {
                        printf("%d frames (%d dropped) in %.3f seconds = %.1f fps (%.1f ms/frame)\n",
                                frame, stats_dropped_frames, elapsed, frame / elapsed,
                                1e3 * elapsed / frame);
                //      chain->print_phase_timing();
                }

                if (should_cut.exchange(false)) {  // Test and clear.
                        string filename = generate_local_dump_filename(frame);
                        printf("Starting new recording: %s\n", filename.c_str());
                        h264_encoder->shutdown();
                        httpd.close_output_file();
                        httpd.open_output_file(filename.c_str());
                        h264_encoder.reset(new H264Encoder(h264_encoder_surface, global_flags.va_display, WIDTH, HEIGHT, &httpd));
                }

#if 0
                // Reset every 100 frames, so that local variations in frame times
                // (especially for the first few frames, when the shaders are
                // compiled etc.) don't make it hard to measure for the entire
                // remaining duration of the program.
                if (frame == 10000) {
                        frame = 0;
                        start = now;
                }
#endif
                check_error();
        }

        resource_pool->clean_context();
}

void Mixer::audio_thread_func()
{
        while (!should_quit) {
                AudioTask task;

                {
                        unique_lock<mutex> lock(audio_mutex);
                        audio_task_queue_changed.wait(lock, [this]{ return !audio_task_queue.empty(); });
                        task = audio_task_queue.front();
                        audio_task_queue.pop();
                }

                process_audio_one_frame(task.pts_int, task.num_samples);
        }
}

void Mixer::process_audio_one_frame(int64_t frame_pts_int, int num_samples)
{
        vector<float> samples_card;
        vector<float> samples_out;

        // TODO: Allow mixing audio from several sources.
        unsigned selected_audio_card = theme->map_signal(audio_source_channel);
        assert(selected_audio_card < num_cards);

        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                samples_card.resize(num_samples * 2);
                {
                        unique_lock<mutex> lock(cards[card_index].audio_mutex);
                        if (!cards[card_index].resampling_queue->get_output_samples(double(frame_pts_int) / TIMEBASE, &samples_card[0], num_samples)) {
                                printf("Card %d reported previous underrun.\n", card_index);
                        }
                }
                if (card_index == selected_audio_card) {
                        samples_out = move(samples_card);
                }
        }

        // Cut away everything under 120 Hz (or whatever the cutoff is);
        // we don't need it for voice, and it will reduce headroom
        // and confuse the compressor. (In particular, any hums at 50 or 60 Hz
        // should be dampened.)
        if (locut_enabled) {
                locut.render(samples_out.data(), samples_out.size() / 2, locut_cutoff_hz * 2.0 * M_PI / OUTPUT_FREQUENCY, 0.5f);
        }

        // Apply a level compressor to get the general level right.
        // Basically, if it's over about -40 dBFS, we squeeze it down to that level
        // (or more precisely, near it, since we don't use infinite ratio),
        // then apply a makeup gain to get it to -14 dBFS. -14 dBFS is, of course,
        // entirely arbitrary, but from practical tests with speech, it seems to
        // put us around -23 LUFS, so it's a reasonable starting point for later use.
        {
                unique_lock<mutex> lock(compressor_mutex);
                if (level_compressor_enabled) {
                        float threshold = 0.01f;   // -40 dBFS.
                        float ratio = 20.0f;
                        float attack_time = 0.5f;
                        float release_time = 20.0f;
                        float makeup_gain = pow(10.0f, (ref_level_dbfs - (-40.0f)) / 20.0f);  // +26 dB.
                        level_compressor.process(samples_out.data(), samples_out.size() / 2, threshold, ratio, attack_time, release_time, makeup_gain);
                        gain_staging_db = 20.0 * log10(level_compressor.get_attenuation() * makeup_gain);
                } else {
                        // Just apply the gain we already had.
                        float g = pow(10.0f, gain_staging_db / 20.0f);
                        for (size_t i = 0; i < samples_out.size(); ++i) {
                                samples_out[i] *= g;
                        }
                }
        }

#if 0
        printf("level=%f (%+5.2f dBFS) attenuation=%f (%+5.2f dB) end_result=%+5.2f dB\n",
                level_compressor.get_level(), 20.0 * log10(level_compressor.get_level()),
                level_compressor.get_attenuation(), 20.0 * log10(level_compressor.get_attenuation()),
                20.0 * log10(level_compressor.get_level() * level_compressor.get_attenuation() * makeup_gain));
#endif

//      float limiter_att, compressor_att;

        // The real compressor.
        if (compressor_enabled) {
                float threshold = pow(10.0f, compressor_threshold_dbfs / 20.0f);
                float ratio = 20.0f;
                float attack_time = 0.005f;
                float release_time = 0.040f;
                float makeup_gain = 2.0f;  // +6 dB.
                compressor.process(samples_out.data(), samples_out.size() / 2, threshold, ratio, attack_time, release_time, makeup_gain);
//              compressor_att = compressor.get_attenuation();
        }

        // Finally a limiter at -4 dB (so, -10 dBFS) to take out the worst peaks only.
        // Note that since ratio is not infinite, we could go slightly higher than this.
        if (limiter_enabled) {
                float threshold = pow(10.0f, limiter_threshold_dbfs / 20.0f);
                float ratio = 30.0f;
                float attack_time = 0.0f;  // Instant.
                float release_time = 0.020f;
                float makeup_gain = 1.0f;  // 0 dB.
                limiter.process(samples_out.data(), samples_out.size() / 2, threshold, ratio, attack_time, release_time, makeup_gain);
//              limiter_att = limiter.get_attenuation();
        }

//      printf("limiter=%+5.1f  compressor=%+5.1f\n", 20.0*log10(limiter_att), 20.0*log10(compressor_att));

        // Upsample 4x to find interpolated peak.
        peak_resampler.inp_data = samples_out.data();
        peak_resampler.inp_count = samples_out.size() / 2;

        vector<float> interpolated_samples_out;
        interpolated_samples_out.resize(samples_out.size());
        while (peak_resampler.inp_count > 0) {  // About four iterations.
                peak_resampler.out_data = &interpolated_samples_out[0];
                peak_resampler.out_count = interpolated_samples_out.size() / 2;
                peak_resampler.process();
                size_t out_stereo_samples = interpolated_samples_out.size() / 2 - peak_resampler.out_count;
                peak = max<float>(peak, find_peak(interpolated_samples_out.data(), out_stereo_samples * 2));
                peak_resampler.out_data = nullptr;
        }

        // At this point, we are most likely close to +0 LU, but all of our
        // measurements have been on raw sample values, not R128 values.
        // So we have a final makeup gain to get us to +0 LU; the gain
        // adjustments required should be relatively small, and also, the
        // offset shouldn't change much (only if the type of audio changes
        // significantly). Thus, we shoot for updating this value basically
        // “whenever we process buffers”, since the R128 calculation isn't exactly
        // something we get out per-sample.
        //
        // Note that there's a feedback loop here, so we choose a very slow filter
        // (half-time of 100 seconds).
        double target_loudness_factor, alpha;
        {
                unique_lock<mutex> lock(compressor_mutex);
                double loudness_lu = r128.loudness_M() - ref_level_lufs;
                double current_makeup_lu = 20.0f * log10(final_makeup_gain);
                target_loudness_factor = pow(10.0f, -loudness_lu / 20.0f);

                // If we're outside +/- 5 LU uncorrected, we don't count it as
                // a normal signal (probably silence) and don't change the
                // correction factor; just apply what we already have.
                if (fabs(loudness_lu - current_makeup_lu) >= 5.0 || !final_makeup_gain_auto) {
                        alpha = 0.0;
                } else {
                        // Formula adapted from
                        // https://en.wikipedia.org/wiki/Low-pass_filter#Simple_infinite_impulse_response_filter.
                        const double half_time_s = 100.0;
                        const double fc_mul_2pi_delta_t = 1.0 / (half_time_s * OUTPUT_FREQUENCY);
                        alpha = fc_mul_2pi_delta_t / (fc_mul_2pi_delta_t + 1.0);
                }

                double m = final_makeup_gain;
                for (size_t i = 0; i < samples_out.size(); i += 2) {
                        samples_out[i + 0] *= m;
                        samples_out[i + 1] *= m;
                        m += (target_loudness_factor - m) * alpha;
                }
                final_makeup_gain = m;
        }

        // Find R128 levels and L/R correlation.
        vector<float> left, right;
        deinterleave_samples(samples_out, &left, &right);
        float *ptrs[] = { left.data(), right.data() };
        {
                unique_lock<mutex> lock(compressor_mutex);
                r128.process(left.size(), ptrs);
                correlation.process_samples(samples_out);
        }

        // Send the samples to the sound card.
        if (alsa) {
                alsa->write(samples_out);
        }

        // And finally add them to the output.
        h264_encoder->add_audio(frame_pts_int, move(samples_out));
}

void Mixer::subsample_chroma(GLuint src_tex, GLuint dst_tex)
{
        GLuint vao;
        glGenVertexArrays(1, &vao);
        check_error();

        glBindVertexArray(vao);
        check_error();

        // Extract Cb/Cr.
        GLuint fbo = resource_pool->create_fbo(dst_tex);
        glBindFramebuffer(GL_FRAMEBUFFER, fbo);
        glViewport(0, 0, WIDTH/2, HEIGHT/2);
        check_error();

        glUseProgram(cbcr_program_num);
        check_error();

        glActiveTexture(GL_TEXTURE0);
        check_error();
        glBindTexture(GL_TEXTURE_2D, src_tex);
        check_error();
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        check_error();
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
        check_error();
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
        check_error();

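        // Shifting the sampling point half an input texel to the left means
        // that, with GL_LINEAR filtering, each output Cb/Cr value becomes the
        // average of two horizontally adjacent input samples (left-sited chroma).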
        float chroma_offset_0[] = { -0.5f / WIDTH, 0.0f };
        set_uniform_vec2(cbcr_program_num, "foo", "chroma_offset_0", chroma_offset_0);

        glBindBuffer(GL_ARRAY_BUFFER, cbcr_vbo);
        check_error();

        for (GLint attr_index : { cbcr_position_attribute_index, cbcr_texcoord_attribute_index }) {
                glEnableVertexAttribArray(attr_index);
                check_error();
                glVertexAttribPointer(attr_index, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
                check_error();
        }

        glDrawArrays(GL_TRIANGLES, 0, 3);
        check_error();

        for (GLint attr_index : { cbcr_position_attribute_index, cbcr_texcoord_attribute_index }) {
                glDisableVertexAttribArray(attr_index);
                check_error();
        }

        glUseProgram(0);
        check_error();
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
        check_error();

        resource_pool->release_fbo(fbo);
        glDeleteVertexArrays(1, &vao);
}

void Mixer::release_display_frame(DisplayFrame *frame)
{
        for (GLuint texnum : frame->temp_textures) {
                resource_pool->release_2d_texture(texnum);
        }
        frame->temp_textures.clear();
        frame->ready_fence.reset();
        frame->input_frames.clear();
}

void Mixer::start()
{
        mixer_thread = thread(&Mixer::thread_func, this);
        audio_thread = thread(&Mixer::audio_thread_func, this);
}

void Mixer::quit()
{
        should_quit = true;
        mixer_thread.join();
        audio_thread.join();
}

void Mixer::transition_clicked(int transition_num)
{
        theme->transition_clicked(transition_num, pts());
}

void Mixer::channel_clicked(int preview_num)
{
        theme->channel_clicked(preview_num);
}

void Mixer::reset_meters()
{
        peak_resampler.reset();
        peak = 0.0f;
        r128.reset();
        r128.integr_start();
        correlation.reset();
}

Mixer::OutputChannel::~OutputChannel()
{
        if (has_current_frame) {
                parent->release_display_frame(&current_frame);
        }
        if (has_ready_frame) {
                parent->release_display_frame(&ready_frame);
        }
}

void Mixer::OutputChannel::output_frame(DisplayFrame frame)
{
        // Store this frame for display. Remove the ready frame if any
        // (it was seemingly never used).
        {
                unique_lock<mutex> lock(frame_mutex);
                if (has_ready_frame) {
                        parent->release_display_frame(&ready_frame);
                }
                ready_frame = frame;
                has_ready_frame = true;
        }

        if (has_new_frame_ready_callback) {
                new_frame_ready_callback();
        }
}

bool Mixer::OutputChannel::get_display_frame(DisplayFrame *frame)
{
        unique_lock<mutex> lock(frame_mutex);
        if (!has_current_frame && !has_ready_frame) {
                return false;
        }

        if (has_current_frame && has_ready_frame) {
                // We have a new ready frame. Toss the current one.
                parent->release_display_frame(&current_frame);
                has_current_frame = false;
        }
        if (has_ready_frame) {
                assert(!has_current_frame);
                current_frame = ready_frame;
                ready_frame.ready_fence.reset();  // Drop the refcount.
                ready_frame.input_frames.clear();  // Drop the refcounts.
                has_current_frame = true;
                has_ready_frame = false;
        }

        *frame = current_frame;
        return true;
}

void Mixer::OutputChannel::set_frame_ready_callback(Mixer::new_frame_ready_callback_t callback)
{
        new_frame_ready_callback = callback;
        has_new_frame_ready_callback = true;
}