]> git.sesse.net Git - nageru/commitdiff
Initial checkin.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 3 Oct 2015 00:23:14 +0000 (02:23 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 3 Oct 2015 00:23:19 +0000 (02:23 +0200)
19 files changed:
Makefile [new file with mode: 0644]
bmusb.cpp [new file with mode: 0644]
bmusb.h [new file with mode: 0644]
context.cpp [new file with mode: 0644]
context.h [new file with mode: 0644]
glwidget.cpp [new file with mode: 0644]
glwidget.h [new file with mode: 0644]
h264encode.cpp [new file with mode: 0644]
h264encode.h [new file with mode: 0644]
main.cpp [new file with mode: 0644]
mainwindow.cpp [new file with mode: 0644]
mainwindow.h [new file with mode: 0644]
mixer.cpp [new file with mode: 0644]
mixer.h [new file with mode: 0644]
pbo_frame_allocator.cpp [new file with mode: 0644]
pbo_frame_allocator.h [new file with mode: 0644]
vs-cbcr.130.vert [new file with mode: 0644]
window.cpp [new file with mode: 0644]
window.h [new file with mode: 0644]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..9958450
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+CXX=g++
+CXXFLAGS := -O2 -march=native -g -std=gnu++11 -Wall -Wno-deprecated-declarations -fPIC $(shell pkg-config --cflags Qt5Core Qt5Widgets Qt5OpenGLExtensions libusb-1.0 movit) -pthread
+LDFLAGS=$(shell pkg-config --libs Qt5Core Qt5Widgets Qt5OpenGLExtensions libusb-1.0 movit) -lEGL -lGL -pthread -lva -lva-drm -lva-x11 -lX11 -lavformat -lavcodec -lavutil
+
+# Qt objects
+OBJS=glwidget.o main.o mainwindow.o window.o
+OBJS += glwidget.moc.o mainwindow.moc.o window.moc.o
+
+# Mixer objects
+OBJS += h264encode.o mixer.o bmusb.o pbo_frame_allocator.o context.o
+
+%.moc.cpp: %.h
+       moc $< -o $@
+
+all: nageru
+
+nageru: $(OBJS)
+       $(CXX) $(LDFLAGS) -o $@ $^
+
+
diff --git a/bmusb.cpp b/bmusb.cpp
new file mode 100644 (file)
index 0000000..207d632
--- /dev/null
+++ b/bmusb.cpp
@@ -0,0 +1,939 @@
+// TODO: Replace with linking to upstream bmusb.
+
+// Intensity Shuttle USB3 prototype capture driver, v0.3
+// Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
+// (can do captures for hours at a time with no drops), except during startup
+// 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
+// Audio comes out as 8-channel 24-bit raw audio.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <libusb.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <assert.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <deque>
+#include <utility>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <stack>
+#include <atomic>
+#include "bmusb.h"
+
+using namespace std;
+using namespace std::placeholders;
+
+#define WIDTH 1280
+#define HEIGHT 750  /* 30 lines ancillary data? */
+//#define WIDTH 1920
+//#define HEIGHT 1125  /* ??? lines ancillary data? */
+#define HEADER_SIZE 44
+//#define HEADER_SIZE 0
+#define AUDIO_HEADER_SIZE 4
+
+//#define FRAME_SIZE (WIDTH * HEIGHT * 2 + HEADER_SIZE)  // UYVY
+//#define FRAME_SIZE (WIDTH * HEIGHT * 2 * 4 / 3 + HEADER_SIZE)  // v210
+#define FRAME_SIZE (8 << 20)
+
+FILE *audiofp;
+
+thread usb_thread;
+atomic<bool> should_quit;
+
+FrameAllocator::~FrameAllocator() {}
+
+#define NUM_QUEUED_FRAMES 16
+class MallocFrameAllocator : public FrameAllocator {
+public:
+       MallocFrameAllocator(size_t frame_size);
+       Frame alloc_frame() override;
+       void release_frame(Frame frame) override;
+
+private:
+       size_t frame_size;
+
+       mutex freelist_mutex;
+       stack<unique_ptr<uint8_t[]>> freelist;  // All of size <frame_size>.
+};
+
+MallocFrameAllocator::MallocFrameAllocator(size_t frame_size)
+       : frame_size(frame_size)
+{
+       for (int i = 0; i < NUM_QUEUED_FRAMES; ++i) {
+               freelist.push(unique_ptr<uint8_t[]>(new uint8_t[frame_size]));
+       }
+}
+
+FrameAllocator::Frame MallocFrameAllocator::alloc_frame()
+{
+       Frame vf;
+       vf.owner = this;
+
+       unique_lock<mutex> lock(freelist_mutex);  // Meh.
+       if (freelist.empty()) {
+               printf("Frame overrun (no more spare frames of size %ld), dropping frame!\n",
+                       frame_size);
+       } else {
+               vf.data = freelist.top().release();
+               vf.size = frame_size;
+               freelist.pop();  // Meh.
+       }
+       return vf;
+}
+
+void MallocFrameAllocator::release_frame(Frame frame)
+{
+       unique_lock<mutex> lock(freelist_mutex);
+       freelist.push(unique_ptr<uint8_t[]>(frame.data));
+}
+
+bool uint16_less_than_with_wraparound(uint16_t a, uint16_t b)
+{
+       if (a == b) {
+               return false;
+       } else if (a < b) {
+               return (b - a < 0x8000);
+       } else {
+               int wrap_b = 0x10000 + int(b);
+               return (wrap_b - a < 0x8000);
+       }
+}
+
+void BMUSBCapture::queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, deque<QueuedFrame> *q)
+{
+       if (!q->empty() && !uint16_less_than_with_wraparound(q->back().timecode, timecode)) {
+               printf("Blocks going backwards: prev=0x%04x, cur=0x%04x (dropped)\n",
+                       q->back().timecode, timecode);
+               frame.owner->release_frame(frame);
+               return;
+       }
+
+       QueuedFrame qf;
+       qf.format = format;
+       qf.timecode = timecode;
+       qf.frame = frame;
+
+       {
+               unique_lock<mutex> lock(queue_lock);
+               q->push_back(move(qf));
+       }
+       queues_not_empty.notify_one();  // might be spurious
+}
+
+void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
+{
+       FILE *fp = fopen(filename, "wb");
+       if (fwrite(frame_start + HEADER_SIZE, frame_len - HEADER_SIZE, 1, fp) != 1) {
+               printf("short write!\n");
+       }
+       fclose(fp);
+}
+
+void dump_audio_block(uint8_t *audio_start, size_t audio_len)
+{
+       fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
+}
+
+void BMUSBCapture::dequeue_thread()
+{
+       for ( ;; ) {
+               unique_lock<mutex> lock(queue_lock);
+               queues_not_empty.wait(lock, [this]{ return !pending_video_frames.empty() && !pending_audio_frames.empty(); });
+
+               uint16_t video_timecode = pending_video_frames.front().timecode;
+               uint16_t audio_timecode = pending_audio_frames.front().timecode;
+               if (video_timecode < audio_timecode) {
+                       printf("Video block 0x%04x without corresponding audio block, dropping.\n",
+                               video_timecode);
+                       video_frame_allocator->release_frame(pending_video_frames.front().frame);
+                       pending_video_frames.pop_front();
+               } else if (audio_timecode < video_timecode) {
+                       printf("Audio block 0x%04x without corresponding video block, dropping.\n",
+                               audio_timecode);
+                       audio_frame_allocator->release_frame(pending_audio_frames.front().frame);
+                       pending_audio_frames.pop_front();
+               } else {
+                       QueuedFrame video_frame = pending_video_frames.front();
+                       QueuedFrame audio_frame = pending_audio_frames.front();
+                       pending_audio_frames.pop_front();
+                       pending_video_frames.pop_front();
+                       lock.unlock();
+
+#if 0
+                       char filename[255];
+                       snprintf(filename, sizeof(filename), "%04x%04x.uyvy", video_frame.format, video_timecode);
+                       dump_frame(filename, video_frame.frame.data, video_frame.data_len);
+                       dump_audio_block(audio_frame.frame.data, audio_frame.data_len); 
+#endif
+
+                       frame_callback(video_timecode,
+                                      video_frame.frame, HEADER_SIZE, video_frame.format,
+                                      audio_frame.frame, AUDIO_HEADER_SIZE, audio_frame.format);
+               }
+       }
+}
+
+void BMUSBCapture::start_new_frame(const uint8_t *start)
+{
+       uint16_t format = (start[3] << 8) | start[2];
+       uint16_t timecode = (start[1] << 8) | start[0];
+
+       if (current_video_frame.len > 0) {
+               //dump_frame();
+               queue_frame(format, timecode, current_video_frame, &pending_video_frames);
+       }
+       //printf("Found frame start, format 0x%04x timecode 0x%04x, previous frame length was %d/%d\n",
+       //      format, timecode,
+       //      //start[7], start[6], start[5], start[4],
+       //      read_current_frame, FRAME_SIZE);
+
+       current_video_frame = video_frame_allocator->alloc_frame();
+       //if (current_video_frame.data == nullptr) {
+       //      read_current_frame = -1;
+       //} else {
+       //      read_current_frame = 0;
+       //}
+}
+
+void BMUSBCapture::start_new_audio_block(const uint8_t *start)
+{
+       uint16_t format = (start[3] << 8) | start[2];
+       uint16_t timecode = (start[1] << 8) | start[0];
+       if (current_audio_frame.len > 0) {
+               //dump_audio_block();
+               queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
+       }
+       //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
+       //      format, timecode, read_current_audio_block);
+       current_audio_frame = audio_frame_allocator->alloc_frame();
+}
+
+#if 0
+static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_packet_descriptor *pack)
+{
+       //      printf("ISO pack%u length:%u, actual_length:%u, offset:%u\n", i, pack->length, pack->actual_length, offset);
+       for (unsigned j = 0; j < pack->actual_length; j++) {
+       //for (int j = 0; j < min(pack->actual_length, 16u); j++) {
+               printf("%02x", xfr->buffer[j + offset]);
+               if ((j % 16) == 15)
+                       printf("\n");
+               else if ((j % 8) == 7)
+                       printf("  ");
+               else
+                       printf(" ");
+       }
+}
+#endif
+
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       assert(n % 2 == 0);
+       uint8_t *dptr1 = dest1;
+       uint8_t *dptr2 = dest2;
+
+       for (size_t i = 0; i < n; i += 2) {
+               *dptr1++ = *src++;
+               *dptr2++ = *src++;
+       }
+}
+
+void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
+{
+       if (current_frame->data == nullptr ||
+           current_frame->len > current_frame->size ||
+           start == end) {
+               return;
+       }
+
+       int bytes = end - start;
+       if (current_frame->len + bytes > current_frame->size) {
+               printf("%d bytes overflow after last %s frame\n",
+                       int(current_frame->len + bytes - current_frame->size), frame_type_name);
+               //dump_frame();
+       } else {
+               if (current_frame->interleaved) {
+                       uint8_t *data = current_frame->data + current_frame->len / 2;
+                       uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
+                       if (current_frame->len % 2 == 1) {
+                               ++data;
+                               swap(data, data2);
+                       }
+                       if (bytes % 2 == 1) {
+                               *data++ = *start++;
+                               swap(data, data2);
+                               ++current_frame->len;
+                               --bytes;
+                       }
+                       memcpy_interleaved(data, data2, start, bytes);
+                       current_frame->len += bytes;
+               } else {
+                       memcpy(current_frame->data + current_frame->len, start, bytes);
+                       current_frame->len += bytes;
+               }
+       }
+}
+
+#ifdef __SSE2__
+
+#if 0
+void dump(const char *name, __m256i n)
+{
+       printf("%-10s:", name);
+       printf(" %02x", _mm256_extract_epi8(n, 0));
+       printf(" %02x", _mm256_extract_epi8(n, 1));
+       printf(" %02x", _mm256_extract_epi8(n, 2));
+       printf(" %02x", _mm256_extract_epi8(n, 3));
+       printf(" %02x", _mm256_extract_epi8(n, 4));
+       printf(" %02x", _mm256_extract_epi8(n, 5));
+       printf(" %02x", _mm256_extract_epi8(n, 6));
+       printf(" %02x", _mm256_extract_epi8(n, 7));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 8));
+       printf(" %02x", _mm256_extract_epi8(n, 9));
+       printf(" %02x", _mm256_extract_epi8(n, 10));
+       printf(" %02x", _mm256_extract_epi8(n, 11));
+       printf(" %02x", _mm256_extract_epi8(n, 12));
+       printf(" %02x", _mm256_extract_epi8(n, 13));
+       printf(" %02x", _mm256_extract_epi8(n, 14));
+       printf(" %02x", _mm256_extract_epi8(n, 15));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 16));
+       printf(" %02x", _mm256_extract_epi8(n, 17));
+       printf(" %02x", _mm256_extract_epi8(n, 18));
+       printf(" %02x", _mm256_extract_epi8(n, 19));
+       printf(" %02x", _mm256_extract_epi8(n, 20));
+       printf(" %02x", _mm256_extract_epi8(n, 21));
+       printf(" %02x", _mm256_extract_epi8(n, 22));
+       printf(" %02x", _mm256_extract_epi8(n, 23));
+       printf(" ");
+       printf(" %02x", _mm256_extract_epi8(n, 24));
+       printf(" %02x", _mm256_extract_epi8(n, 25));
+       printf(" %02x", _mm256_extract_epi8(n, 26));
+       printf(" %02x", _mm256_extract_epi8(n, 27));
+       printf(" %02x", _mm256_extract_epi8(n, 28));
+       printf(" %02x", _mm256_extract_epi8(n, 29));
+       printf(" %02x", _mm256_extract_epi8(n, 30));
+       printf(" %02x", _mm256_extract_epi8(n, 31));
+       printf("\n");
+}
+#endif
+
+// Does a memcpy and memchr in one to reduce processing time.
+// Note that the benefit is somewhat limited if your L3 cache is small,
+// as you'll (unfortunately) spend most of the time loading the data
+// from main memory.
+//
+// Complicated cases are left to the slow path; it basically stops copying
+// up until the first instance of "sync_char" (usually a bit before, actually).
+// This is fine, since 0x00 bytes shouldn't really show up in normal picture
+// data, and what we really need this for is the 00 00 ff ff marker in video data.
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+       if (current_frame->data == nullptr ||
+           current_frame->len > current_frame->size ||
+           start == limit) {
+               return start;
+       }
+       size_t orig_bytes = limit - start;
+       if (orig_bytes < 128) {
+               // Don't bother.
+               return start;
+       }
+
+       // Don't read more bytes than we can write.
+       limit = min(limit, start + (current_frame->size - current_frame->len));
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (start >= limit) {
+               return start;
+       }
+
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
+       if (aligned_start != start) {
+               const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
+               if (sync_start == nullptr) {
+                       add_to_frame(current_frame, "", start, aligned_start);
+               } else {
+                       add_to_frame(current_frame, "", start, sync_start);
+                       return sync_start;
+               }
+       }
+
+       // Make the length a multiple of 64.
+       if (current_frame->interleaved) {
+               if (((limit - aligned_start) % 64) != 0) {
+                       limit -= 32;
+               }
+               assert(((limit - aligned_start) % 64) == 0);
+       }
+
+#if __AVX2__
+       const __m256i needle = _mm256_set1_epi8(sync_char);
+
+       const __restrict __m256i *in = (const __m256i *)aligned_start;
+       if (current_frame->interleaved) {
+               __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m256i shuffle_cw = _mm256_set_epi8(
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+               while (in < (const __m256i *)limit) {
+                       // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+                       __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+                       __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
+                       __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+                       __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+                       __m256i found = _mm256_or_si256(found1, found2);
+
+                       data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+                       data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+               
+                       data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+                       data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+                       __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+                       __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       _mm256_storeu_si256(out2, hi);
+
+                       if (!_mm256_testz_si256(found, found)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
+               }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+               while (in < (const __m256i *)limit) {
+                       __m256i data = _mm256_load_si256(in);
+                       _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m256i found = _mm256_cmpeq_epi8(data, needle);
+                       if (!_mm256_testz_si256(found, found)) {
+                               break;
+                       }
+
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
+       }
+#else
+       const __m128i needle = _mm_set1_epi8(sync_char);
+
+       const __m128i *in = (const __m128i *)aligned_start;
+       if (current_frame->interleaved) {
+               __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+               while (in < (const __m128i *)limit) {
+                       __m128i data1 = _mm_load_si128(in);
+                       __m128i data2 = _mm_load_si128(in + 1);
+                       __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+                       __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+                       __m128i data1_hi = _mm_srli_epi16(data1, 8);
+                       __m128i data2_hi = _mm_srli_epi16(data2, 8);
+                       __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+                       _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+                       _mm_storeu_si128(out2, hi);
+                       __m128i found1 = _mm_cmpeq_epi8(data1, needle);
+                       __m128i found2 = _mm_cmpeq_epi8(data2, needle);
+                       if (!_mm_testz_si128(found1, found1) ||
+                           !_mm_testz_si128(found2, found2)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
+               }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+               while (in < (const __m128i *)limit) {
+                       __m128i data = _mm_load_si128(in);
+                       _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m128i found = _mm_cmpeq_epi8(data, needle);
+                       if (!_mm_testz_si128(found, found)) {
+                               break;
+                       }
+
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
+       }
+#endif
+
+       //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+
+       return (const uint8_t *)in;
+}
+#endif
+
+void decode_packs(const libusb_transfer *xfr,
+                  const char *sync_pattern,
+                  int sync_length,
+                  FrameAllocator::Frame *current_frame,
+                  const char *frame_type_name,
+                  function<void(const uint8_t *start)> start_callback)
+{
+       int offset = 0;
+       for (int i = 0; i < xfr->num_iso_packets; i++) {
+               const libusb_iso_packet_descriptor *pack = &xfr->iso_packet_desc[i];
+
+               if (pack->status != LIBUSB_TRANSFER_COMPLETED) {
+                       fprintf(stderr, "Error: pack %u/%u status %d\n", i, xfr->num_iso_packets, pack->status);
+                       continue;
+//exit(5);
+               }
+
+               const uint8_t *start = xfr->buffer + offset;
+               const uint8_t *limit = start + pack->actual_length;
+               while (start < limit) {  // Usually runs only one iteration.
+#ifdef __SSE2__
+                       start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
+                       if (start == limit) break;
+                       assert(start < limit);
+#endif
+
+                       const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
+                       if (start_next_frame == nullptr) {
+                               // add the rest of the buffer
+                               add_to_frame(current_frame, frame_type_name, start, limit);
+                               break;
+                       } else {
+                               add_to_frame(current_frame, frame_type_name, start, start_next_frame);
+                               start = start_next_frame + sync_length;  // skip sync
+                               start_callback(start);
+                       }
+               }
+#if 0
+               dump_pack(xfr, offset, pack);
+#endif
+               offset += pack->length;
+       }
+}
+
+void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
+{
+       if (xfr->status != LIBUSB_TRANSFER_COMPLETED) {
+               fprintf(stderr, "transfer status %d\n", xfr->status);
+               libusb_free_transfer(xfr);
+               exit(3);
+       }
+
+       assert(xfr->user_data != nullptr);
+       BMUSBCapture *usb = static_cast<BMUSBCapture *>(xfr->user_data);
+
+       if (xfr->type == LIBUSB_TRANSFER_TYPE_ISOCHRONOUS) {
+               if (xfr->endpoint == 0x84) {
+                       decode_packs(xfr, "DeckLinkAudioResyncT", 20, &usb->current_audio_frame, "audio", bind(&BMUSBCapture::start_new_audio_block, usb, _1));
+               } else {
+                       decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
+               }
+       }
+       if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
+               //const libusb_control_setup *setup = libusb_control_transfer_get_setup(xfr);
+               uint8_t *buf = libusb_control_transfer_get_data(xfr);
+#if 0
+               if (setup->wIndex == 44) {
+                       printf("read timer register: 0x%02x%02x%02x%02x\n", buf[0], buf[1], buf[2], buf[3]);
+               } else {
+                       printf("read register %2d:                      0x%02x%02x%02x%02x\n",
+                               setup->wIndex, buf[0], buf[1], buf[2], buf[3]);
+               }
+#else
+               memcpy(usb->register_file + usb->current_register, buf, 4);
+               usb->current_register = (usb->current_register + 4) % NUM_BMUSB_REGISTERS;
+               if (usb->current_register == 0) {
+                       // read through all of them
+                       printf("register dump:");
+                       for (int i = 0; i < NUM_BMUSB_REGISTERS; i += 4) {
+                               printf(" 0x%02x%02x%02x%02x", usb->register_file[i], usb->register_file[i + 1], usb->register_file[i + 2], usb->register_file[i + 3]);
+                       }
+                       printf("\n");
+               }
+               libusb_fill_control_setup(xfr->buffer,
+                   LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
+                       /*index=*/usb->current_register, /*length=*/4);
+#endif
+       }
+
+#if 0
+       printf("length:%u, actual_length:%u\n", xfr->length, xfr->actual_length);
+       for (i = 0; i < xfr->actual_length; i++) {
+               printf("%02x", xfr->buffer[i]);
+               if (i % 16)
+                       printf("\n");
+               else if (i % 8)
+                       printf("  ");
+               else
+                       printf(" ");
+       }
+#endif
+
+       if (libusb_submit_transfer(xfr) < 0) {
+               fprintf(stderr, "error re-submitting URB\n");
+               exit(1);
+       }
+}
+
+void BMUSBCapture::usb_thread_func()
+{
+       sched_param param;
+       memset(&param, 0, sizeof(param));
+       param.sched_priority = 1;
+       if (sched_setscheduler(0, SCHED_RR, &param) == -1) {
+               printf("couldn't set realtime priority for USB thread: %s\n", strerror(errno));
+       }
+       while (!should_quit) {
+               int rc = libusb_handle_events(nullptr);
+               if (rc != LIBUSB_SUCCESS)
+                       break;
+       }
+}
+
+void BMUSBCapture::configure_card()
+{
+       if (video_frame_allocator == nullptr) {
+               set_video_frame_allocator(new MallocFrameAllocator(FRAME_SIZE));  // FIXME: leak.
+       }
+       if (audio_frame_allocator == nullptr) {
+               set_audio_frame_allocator(new MallocFrameAllocator(65536));  // FIXME: leak.
+       }
+       thread(&BMUSBCapture::dequeue_thread, this).detach();
+
+       int rc;
+       struct libusb_transfer *xfr;
+
+       rc = libusb_init(nullptr);
+       if (rc < 0) {
+               fprintf(stderr, "Error initializing libusb: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+
+       //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd3b);
+       //struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, 0x1edb, 0xbd4f);
+       struct libusb_device_handle *devh = libusb_open_device_with_vid_pid(nullptr, vid, pid);
+       if (!devh) {
+               fprintf(stderr, "Error finding USB device\n");
+               exit(1);
+       }
+
+       libusb_config_descriptor *config;
+       rc = libusb_get_config_descriptor(libusb_get_device(devh), /*config_index=*/0, &config);
+       if (rc < 0) {
+               fprintf(stderr, "Error getting configuration: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+       printf("%d interface\n", config->bNumInterfaces);
+       for (int interface_number = 0; interface_number < config->bNumInterfaces; ++interface_number) {
+               printf("  interface %d\n", interface_number);
+               const libusb_interface *interface = &config->interface[interface_number];
+               for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
+                       const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
+                       printf("    alternate setting %d\n", interface_desc->bAlternateSetting);
+                       for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
+                               const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
+                               printf("        endpoint address 0x%02x\n", endpoint->bEndpointAddress);
+                       }
+               }
+       }
+
+#if 0
+       rc = libusb_set_configuration(devh, /*configuration=*/1);
+       if (rc < 0) {
+               fprintf(stderr, "Error setting configuration 1: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+#endif
+
+       rc = libusb_claim_interface(devh, 0);
+       if (rc < 0) {
+               fprintf(stderr, "Error claiming interface 0: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+
+#if 0
+       rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
+       if (rc < 0) {
+               fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+#endif
+
+#if 0
+       rc = libusb_claim_interface(devh, 3);
+       if (rc < 0) {
+               fprintf(stderr, "Error claiming interface 3: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+#endif
+
+       // theories:
+       //   44 is some kind of timer register (first 16 bits count upwards)
+       //   24 is some sort of watchdog?
+       //      you can seemingly set it to 0x73c60001 and that bit will eventually disappear
+       //      (or will go to 0x73c60010?), also seen 0x73c60100
+       //   12 also changes all the time, unclear why  
+       //   16 seems to be autodetected mode somehow
+       //      --    this is e00115e0 after reset?
+       //                    ed0115e0 after mode change [to output?]
+       //                    2d0015e0 after more mode change [to input]
+       //                    ed0115e0 after more mode change
+       //                    2d0015e0 after more mode change
+       //
+       //                    390115e0 seems to indicate we have signal
+       //         changes to 200115e0 when resolution changes/we lose signal, driver resets after a while
+       //
+       //                    200015e0 on startup
+       //         changes to 250115e0 when we sync to the signal
+       //
+       //    so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
+       //
+       //    28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
+       //    however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
+       //
+       //    4, 8, 20 are unclear. seem to be some sort of bitmask, but we can set them to 0 with no apparent effect.
+       //    perhaps some of them are related to analog output?
+       //
+       //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
+       //    but the driver sets it to 0x8036802a at some point.
+       //
+       // register 16:
+       // first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
+       //
+       // theories:
+       //   0x01 - stable signal
+       //   0x04 - deep color
+       //   0x08 - unknown (audio??)
+       //   0x20 - 720p??
+       //   0x30 - 576p??
+
+       struct ctrl {
+               int endpoint;
+               int request;
+               int index;
+               uint32_t data;
+       };
+       static const ctrl ctrls[] = {
+               { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
+               { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
+
+               // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
+               // capture (v210).
+               // clearing the 0x08000000 bit seems to change the capture format (other source?)
+               // 0x10000000 = analog audio instead of embedded audio, it seems
+               // 0x3a000000 = component video? (analog audio)
+               // 0x3c000000 = composite video? (analog audio)
+               // 0x3e000000 = s-video? (analog audio)
+               { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
+               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
+               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
+               { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
+               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
+       };
+
+       for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {
+               uint32_t flipped = htonl(ctrls[req].data);
+               static uint8_t value[4];
+               memcpy(value, &flipped, sizeof(flipped));
+               int size = sizeof(value);
+               //if (ctrls[req].request == 215) size = 0;
+               rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | ctrls[req].endpoint,
+                       /*request=*/ctrls[req].request, /*value=*/0, /*index=*/ctrls[req].index, value, size, /*timeout=*/0);
+               if (rc < 0) {
+                       fprintf(stderr, "Error on control %d: %s\n", ctrls[req].index, libusb_error_name(rc));
+                       exit(1);
+               }
+               
+               printf("rc=%d: ep=%d@%d %d -> 0x", rc, ctrls[req].endpoint, ctrls[req].request, ctrls[req].index);
+               for (int i = 0; i < rc; ++i) {
+                       printf("%02x", value[i]);
+               }
+               printf("\n");
+       }
+
+       // Alternate setting 1 is output, alternate setting 2 is input.
+       // Card is reset when switching alternates, so the driver uses
+       // this “double switch” when it wants to reset.
+#if 0
+       rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/1);
+       if (rc < 0) {
+               fprintf(stderr, "Error setting alternate 1: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+#endif
+       rc = libusb_set_interface_alt_setting(devh, /*interface=*/0, /*alternate_setting=*/4);
+       if (rc < 0) {
+               fprintf(stderr, "Error setting alternate 4: %s\n", libusb_error_name(rc));
+               exit(1);
+       }
+
+#if 0
+       // DEBUG
+       for ( ;; ) {
+               static int my_index = 0;
+               static uint8_t value[4];
+               int size = sizeof(value);
+               rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN,
+                       /*request=*/214, /*value=*/0, /*index=*/my_index, value, size, /*timeout=*/0);
+               if (rc < 0) {
+                       fprintf(stderr, "Error on control\n");
+                       exit(1);
+               }
+               printf("rc=%d index=%d: 0x", rc, my_index);
+               for (int i = 0; i < rc; ++i) {
+                       printf("%02x", value[i]);
+               }
+               printf("\n");
+       }
+#endif
+
+#if 0
+       // set up an asynchronous transfer of the timer register
+       static uint8_t cmdbuf[LIBUSB_CONTROL_SETUP_SIZE + 4];
+       static int completed = 0;
+
+       xfr = libusb_alloc_transfer(0);
+       libusb_fill_control_setup(cmdbuf,
+           LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
+               /*index=*/44, /*length=*/4);
+       libusb_fill_control_transfer(xfr, devh, cmdbuf, cb_xfr, &completed, 0);
+       xfr->user_data = this;
+       libusb_submit_transfer(xfr);
+
+       // set up an asynchronous transfer of register 24
+       static uint8_t cmdbuf2[LIBUSB_CONTROL_SETUP_SIZE + 4];
+       static int completed2 = 0;
+
+       xfr = libusb_alloc_transfer(0);
+       libusb_fill_control_setup(cmdbuf2,
+           LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
+               /*index=*/24, /*length=*/4);
+       libusb_fill_control_transfer(xfr, devh, cmdbuf2, cb_xfr, &completed2, 0);
+       xfr->user_data = this;
+       libusb_submit_transfer(xfr);
+#endif
+
+       // set up an asynchronous transfer of the register dump
+       static uint8_t cmdbuf3[LIBUSB_CONTROL_SETUP_SIZE + 4];
+       static int completed3 = 0;
+
+       xfr = libusb_alloc_transfer(0);
+       libusb_fill_control_setup(cmdbuf3,
+           LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_IN, /*request=*/214, /*value=*/0,
+               /*index=*/current_register, /*length=*/4);
+       libusb_fill_control_transfer(xfr, devh, cmdbuf3, cb_xfr, &completed3, 0);
+       xfr->user_data = this;
+       //libusb_submit_transfer(xfr);
+
+       audiofp = fopen("audio.raw", "wb");
+
+       // set up isochronous transfers for audio and video
+       for (int e = 3; e <= 4; ++e) {
+               //int num_transfers = (e == 3) ? 6 : 6;
+               int num_transfers = 2;
+               for (int i = 0; i < num_transfers; ++i) {
+                       int num_iso_pack, size;
+                       if (e == 3) {
+                               // Video seems to require isochronous packets scaled with the width; 
+                               // seemingly six lines is about right, rounded up to the required 1kB
+                               // multiple.
+                               size = WIDTH * 2 * 6;
+                               // Note that for 10-bit input, you'll need to increase size accordingly.
+                               //size = size * 4 / 3;
+                               if (size % 1024 != 0) {
+                                       size &= ~1023;
+                                       size += 1024;
+                               }
+                               num_iso_pack = (2 << 20) / size;  // 512 kB.
+                               printf("Picking %d packets of 0x%x bytes each\n", num_iso_pack, size);
+                       } else {
+                               size = 0xc0;
+                               num_iso_pack = 80;
+                       }
+                       int num_bytes = num_iso_pack * size;
+                       uint8_t *buf = new uint8_t[num_bytes];
+
+                       xfr = libusb_alloc_transfer(num_iso_pack);
+                       if (!xfr) {
+                               fprintf(stderr, "oom\n");
+                               exit(1);
+                       }
+
+                       int ep = LIBUSB_ENDPOINT_IN | e;
+                       libusb_fill_iso_transfer(xfr, devh, ep, buf, num_bytes,
+                               num_iso_pack, cb_xfr, nullptr, 0);
+                       libusb_set_iso_packet_lengths(xfr, size);
+                       xfr->user_data = this;
+                       iso_xfrs.push_back(xfr);
+               }
+       }
+}
+
+void BMUSBCapture::start_bm_capture()
+{
+       printf("starting capture\n");
+       int i = 0;
+       for (libusb_transfer *xfr : iso_xfrs) {
+               printf("submitting transfer...\n");
+               int rc = libusb_submit_transfer(xfr);
+               ++i;
+               if (rc < 0) {
+                       //printf("num_bytes=%d\n", num_bytes);
+                       fprintf(stderr, "Error submitting iso to endpoint 0x%02x, number %d: %s\n",
+                               xfr->endpoint, i, libusb_error_name(rc));
+                       exit(1);
+               }
+       }
+
+
+#if 0
+       libusb_release_interface(devh, 0);
+out:
+       if (devh)
+               libusb_close(devh);
+       libusb_exit(nullptr);
+       return rc;
+#endif
+}
+
+void BMUSBCapture::start_bm_thread()
+{
+       should_quit = false;
+       usb_thread = thread(&BMUSBCapture::usb_thread_func);
+}
+
+void BMUSBCapture::stop_bm_thread()
+{
+       should_quit = true;
+       usb_thread.join();
+}
diff --git a/bmusb.h b/bmusb.h
new file mode 100644 (file)
index 0000000..c881748
--- /dev/null
+++ b/bmusb.h
@@ -0,0 +1,138 @@
+#ifndef _BMUSB_H
+#define _BMUSB_H
+
+#include <stdint.h>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <functional>
+#include <mutex>
+#include <thread>
+
+// An interface for frame allocators; if you do not specify one
+// (using set_video_frame_allocator), a default one that pre-allocates
+// a freelist of eight frames using new[] will be used. Specifying
+// your own can be useful if you have special demands for where you want the
+// frame to end up and don't want to spend the extra copy to get it there, for
+// instance GPU memory.
+class FrameAllocator {
+ public:
+       struct Frame {
+               uint8_t *data = nullptr;
+               uint8_t *data2 = nullptr;  // Only if interleaved == true.
+               size_t len = 0;  // Number of bytes we actually have.
+               size_t size = 0;  // Number of bytes we have room for.
+               void *userdata = nullptr;
+               FrameAllocator *owner = nullptr;
+
+               // If set to true, every other byte will go to data and to data2.
+               // If so, <len> and <size> are still about the number of total bytes
+               // so if size == 1024, there's 512 bytes in data and 512 in data2.
+               bool interleaved = false;
+       };
+
+       virtual ~FrameAllocator();
+
+       // Request a video frame. Note that this is called from the
+       // USB thread, which runs with realtime priority and is
+       // very sensitive to delays. Thus, you should not do anything
+       // here that might sleep, including calling malloc().
+       // (Taking a mutex is borderline.)
+       //
+       // The Frame object will be given to the frame callback,
+       // which is responsible for releasing the video frame back
+       // once it is usable for new frames (ie., it will no longer
+       // be read from). You can use the "userdata" pointer for
+       // whatever you want to identify this frame if you need to.
+       //
+       // Returning a Frame with data==nullptr is allowed;
+       // if so, the frame in progress will be dropped.
+       virtual Frame alloc_frame() = 0;
+
+       virtual void release_frame(Frame frame) = 0;
+};
+
+typedef std::function<void(uint16_t timecode,
+                           FrameAllocator::Frame video_frame, size_t video_offset, uint16_t video_format,
+                           FrameAllocator::Frame audio_frame, size_t audio_offset, uint16_t audio_format)>
+       frame_callback_t;
+
+// The actual capturing class, representing capture from a single card.
+class BMUSBCapture {
+ public:
+       BMUSBCapture(int vid = 0x1edb, int pid = 0xbd3b)
+               : vid(vid), pid(pid)
+       {
+       }
+
+       // Does not take ownership.
+       void set_video_frame_allocator(FrameAllocator *allocator)
+       {
+               video_frame_allocator = allocator;
+       }
+
+       FrameAllocator *get_video_frame_allocator()
+       {
+               return video_frame_allocator;
+       }
+
+       // Does not take ownership.
+       void set_audio_frame_allocator(FrameAllocator *allocator)
+       {
+               audio_frame_allocator = allocator;
+       }
+
+       FrameAllocator *get_audio_frame_allocator()
+       {
+               return audio_frame_allocator;
+       }
+
+       void set_frame_callback(frame_callback_t callback)
+       {
+               frame_callback = callback;
+       }
+
+       void configure_card();
+       void start_bm_capture();
+
+       static void start_bm_thread();
+       static void stop_bm_thread();
+
+ private:
+       struct QueuedFrame {
+               uint16_t timecode;
+               uint16_t format;
+               FrameAllocator::Frame frame;
+       };
+
+       void start_new_audio_block(const uint8_t *start);
+       void start_new_frame(const uint8_t *start);
+
+       void queue_frame(uint16_t format, uint16_t timecode, FrameAllocator::Frame frame, std::deque<QueuedFrame> *q);
+       void dequeue_thread();
+
+       static void usb_thread_func();
+       static void cb_xfr(struct libusb_transfer *xfr);
+
+       FrameAllocator::Frame current_video_frame;
+       FrameAllocator::Frame current_audio_frame;
+
+       std::mutex queue_lock;
+       std::condition_variable queues_not_empty;
+       std::deque<QueuedFrame> pending_video_frames;
+       std::deque<QueuedFrame> pending_audio_frames;
+
+       FrameAllocator *video_frame_allocator = nullptr;
+       FrameAllocator *audio_frame_allocator = nullptr;
+       frame_callback_t frame_callback = nullptr;
+
+       int current_register = 0;
+
+       static constexpr int NUM_BMUSB_REGISTERS = 60;
+       uint8_t register_file[NUM_BMUSB_REGISTERS];
+
+       int vid, pid;
+       std::vector<libusb_transfer *> iso_xfrs;
+};
+
+#endif
diff --git a/context.cpp b/context.cpp
new file mode 100644 (file)
index 0000000..72275e9
--- /dev/null
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <QOpenGLContext>
+#include <QOffscreenSurface>
+#include <QWindow>
+
+QSurface *create_surface(const QSurfaceFormat &format)
+{
+       QOffscreenSurface *surface = new QOffscreenSurface;
+       surface->setFormat(format);
+//     QWindow *surface = new QWindow;
+       surface->create();
+       if (!surface->isValid()) {
+               printf("ERROR: surface not valid!\n");
+//             abort();
+       }
+       return surface;
+}
+
+QOpenGLContext *create_context()
+{
+       QOpenGLContext *context = new QOpenGLContext;
+       context->setShareContext(QOpenGLContext::globalShareContext());
+       context->create();
+       return context;
+}
+
+bool make_current(QOpenGLContext *context, QSurface *surface)
+{
+       return context->makeCurrent(surface);
+}
diff --git a/context.h b/context.h
new file mode 100644 (file)
index 0000000..5a75766
--- /dev/null
+++ b/context.h
@@ -0,0 +1,11 @@
+
+// Needs to be in its own file because Qt and libepoxy seemingly don't coexist well
+// within the same file.
+
+class QSurface;
+class QOpenGLContext;
+class QSurfaceFormat;
+
+QSurface *create_surface(const QSurfaceFormat &format);
+QOpenGLContext *create_context();
+bool make_current(QOpenGLContext *context, QSurface *surface);
diff --git a/glwidget.cpp b/glwidget.cpp
new file mode 100644 (file)
index 0000000..cc2b4fd
--- /dev/null
@@ -0,0 +1,49 @@
+#include "context.h"
+#include "glwidget.h"
+#include "mixer.h"
+#include <QCoreApplication>
+#include <QGuiApplication>
+#include <QThread>
+#include <math.h>
+#include <EGL/egl.h>
+#include <GL/glx.h>
+#include <thread>
+
+GLWidget::GLWidget(QWidget *parent)
+    : QOpenGLWidget(parent)
+{
+}
+
+GLWidget::~GLWidget()
+{
+}
+
+QSize GLWidget::minimumSizeHint() const
+{
+       return QSize(50, 50);
+}
+
+QSize GLWidget::sizeHint() const
+{
+       return QSize(400, 400);
+}
+
+void GLWidget::initializeGL()
+{
+       printf("egl=%p glx=%p\n", eglGetCurrentContext(), glXGetCurrentContext());
+       //printf("threads: %p %p\n", QThread::currentThread(), qGuiApp->thread());
+
+       QSurface *surface = create_surface(format());
+       QSurface *surface2 = create_surface(format());
+       QSurface *surface3 = create_surface(format());
+       QSurface *surface4 = create_surface(format());
+       std::thread([surface, surface2, surface3, surface4]{
+               mixer_thread(surface, surface2, surface3, surface4);
+       }).detach();
+}
+
+void GLWidget::paintGL()
+{
+       glClearColor(1.0f, 0.0f, 0.0f, 1.0f);
+       glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+}
diff --git a/glwidget.h b/glwidget.h
new file mode 100644 (file)
index 0000000..eb26d38
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef GLWIDGET_H
+#define GLWIDGET_H
+
+#include <QOpenGLWidget>
+#include <QOpenGLFunctions>
+#include <QOpenGLVertexArrayObject>
+#include <QOpenGLBuffer>
+#include <QMatrix4x4>
+
+class GLWidget : public QOpenGLWidget
+{
+       Q_OBJECT
+
+public:
+       GLWidget(QWidget *parent = 0);
+       ~GLWidget();
+
+       QSize minimumSizeHint() const Q_DECL_OVERRIDE;
+       QSize sizeHint() const Q_DECL_OVERRIDE;
+
+protected:
+       void initializeGL() Q_DECL_OVERRIDE;
+       void paintGL() Q_DECL_OVERRIDE;
+};
+
+#endif
diff --git a/h264encode.cpp b/h264encode.cpp
new file mode 100644 (file)
index 0000000..d4fc0ad
--- /dev/null
@@ -0,0 +1,1949 @@
+//#include "sysdeps.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <pthread.h>
+#include <errno.h>
+#include <math.h>
+#include <va/va.h>
+#include <va/va_x11.h>
+#include <va/va_enc_h264.h>
+#include <va/va_drmcommon.h>
+#include <libdrm/drm_fourcc.h>
+#include <thread>
+#include <mutex>
+#include <queue>
+#include <condition_variable>
+#include "h264encode.h"
+
+#define CHECK_VASTATUS(va_status, func)                                 \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        exit(1);                                                        \
+    }
+
+//#include "loadsurface.h"
+
+#define NAL_REF_IDC_NONE        0
+#define NAL_REF_IDC_LOW         1
+#define NAL_REF_IDC_MEDIUM      2
+#define NAL_REF_IDC_HIGH        3
+
+#define NAL_NON_IDR             1
+#define NAL_IDR                 5
+#define NAL_SPS                 7
+#define NAL_PPS                 8
+#define NAL_SEI                        6
+
+#define SLICE_TYPE_P            0
+#define SLICE_TYPE_B            1
+#define SLICE_TYPE_I            2
+#define IS_P_SLICE(type) (SLICE_TYPE_P == (type))
+#define IS_B_SLICE(type) (SLICE_TYPE_B == (type))
+#define IS_I_SLICE(type) (SLICE_TYPE_I == (type))
+
+
+#define ENTROPY_MODE_CAVLC      0
+#define ENTROPY_MODE_CABAC      1
+
+#define PROFILE_IDC_BASELINE    66
+#define PROFILE_IDC_MAIN        77
+#define PROFILE_IDC_HIGH        100
+   
+#define BITSTREAM_ALLOCATE_STEPPING     4096
+
+#define SURFACE_NUM 16 /* 16 surfaces for source YUV */
+static  VADisplay va_dpy;
+static  VAProfile h264_profile = (VAProfile)~0;
+static  VAConfigAttrib config_attrib[VAConfigAttribTypeMax];
+static  int config_attrib_num = 0, enc_packed_header_idx;
+
+struct GLSurface {
+       VASurfaceID src_surface, ref_surface;
+       VABufferID coded_buf;
+
+       VAImage surface_image;
+       GLuint y_tex, cbcr_tex;
+       EGLImage y_egl_image, cbcr_egl_image;
+};
+GLSurface gl_surfaces[SURFACE_NUM];
+
+static  VAConfigID config_id;
+static  VAContextID context_id;
+static  VAEncSequenceParameterBufferH264 seq_param;
+static  VAEncPictureParameterBufferH264 pic_param;
+static  VAEncSliceParameterBufferH264 slice_param;
+static  VAPictureH264 CurrentCurrPic;
+static  VAPictureH264 ReferenceFrames[16], RefPicList0_P[32], RefPicList0_B[32], RefPicList1_B[32];
+
+static  unsigned int MaxFrameNum = (2<<16);
+static  unsigned int MaxPicOrderCntLsb = (2<<8);
+static  unsigned int Log2MaxFrameNum = 16;
+static  unsigned int Log2MaxPicOrderCntLsb = 8;
+
+static  unsigned int num_ref_frames = 2;
+static  unsigned int numShortTerm = 0;
+static  int constraint_set_flag = 0;
+static  int h264_packedheader = 0; /* support pack header? */
+static  int h264_maxref = (1<<16|1);
+static  int h264_entropy_mode = 1; /* cabac */
+
+static  char *coded_fn = NULL;
+static  FILE *coded_fp = NULL;
+
+static  int frame_width = 176;
+static  int frame_height = 144;
+static  int frame_width_mbaligned;
+static  int frame_height_mbaligned;
+static  int frame_rate = 60;
+static  unsigned int frame_bitrate = 0;
+static  unsigned int frame_slices = 1;
+static  double frame_size = 0;
+static  int initial_qp = 15;
+//static  int initial_qp = 28;
+static  int minimal_qp = 0;
+static  int intra_period = 30;
+static  int intra_idr_period = 60;
+static  int ip_period = 1;
+static  int rc_mode = -1;
+static  int rc_default_modes[] = {
+    VA_RC_VBR,
+    VA_RC_CQP,
+    VA_RC_VBR_CONSTRAINED,
+    VA_RC_CBR,
+    VA_RC_VCM,
+    VA_RC_NONE,
+};
+static  unsigned long long current_frame_encoding = 0;
+static  unsigned long long current_frame_display = 0;
+static  unsigned long long current_IDR_display = 0;
+static  unsigned int current_frame_num = 0;
+static  int current_frame_type;
+
+static  int misc_priv_type = 0;
+static  int misc_priv_value = 0;
+
+/* thread to save coded data */
+#define SRC_SURFACE_FREE        0
+#define SRC_SURFACE_IN_ENCODING 1
+    
+struct __bitstream {
+    unsigned int *buffer;
+    int bit_offset;
+    int max_size_in_dword;
+};
+typedef struct __bitstream bitstream;
+
+using namespace std;
+
+static unsigned int 
+va_swap32(unsigned int val)
+{
+    unsigned char *pval = (unsigned char *)&val;
+
+    return ((pval[0] << 24)     |
+            (pval[1] << 16)     |
+            (pval[2] << 8)      |
+            (pval[3] << 0));
+}
+
+static void
+bitstream_start(bitstream *bs)
+{
+    bs->max_size_in_dword = BITSTREAM_ALLOCATE_STEPPING;
+    bs->buffer = (unsigned int *)calloc(bs->max_size_in_dword * sizeof(int), 1);
+    bs->bit_offset = 0;
+}
+
+static void
+bitstream_end(bitstream *bs)
+{
+    int pos = (bs->bit_offset >> 5);
+    int bit_offset = (bs->bit_offset & 0x1f);
+    int bit_left = 32 - bit_offset;
+
+    if (bit_offset) {
+        bs->buffer[pos] = va_swap32((bs->buffer[pos] << bit_left));
+    }
+}
+static void
+bitstream_put_ui(bitstream *bs, unsigned int val, int size_in_bits)
+{
+    int pos = (bs->bit_offset >> 5);
+    int bit_offset = (bs->bit_offset & 0x1f);
+    int bit_left = 32 - bit_offset;
+
+    if (!size_in_bits)
+        return;
+
+    bs->bit_offset += size_in_bits;
+
+    if (bit_left > size_in_bits) {
+        bs->buffer[pos] = (bs->buffer[pos] << size_in_bits | val);
+    } else {
+        size_in_bits -= bit_left;
+        bs->buffer[pos] = (bs->buffer[pos] << bit_left) | (val >> size_in_bits);
+        bs->buffer[pos] = va_swap32(bs->buffer[pos]);
+
+        if (pos + 1 == bs->max_size_in_dword) {
+            bs->max_size_in_dword += BITSTREAM_ALLOCATE_STEPPING;
+            bs->buffer = (unsigned int *)realloc(bs->buffer, bs->max_size_in_dword * sizeof(unsigned int));
+        }
+
+        bs->buffer[pos + 1] = val;
+    }
+}
+
+static void
+bitstream_put_ue(bitstream *bs, unsigned int val)
+{
+    int size_in_bits = 0;
+    int tmp_val = ++val;
+
+    while (tmp_val) {
+        tmp_val >>= 1;
+        size_in_bits++;
+    }
+
+    bitstream_put_ui(bs, 0, size_in_bits - 1); // leading zero
+    bitstream_put_ui(bs, val, size_in_bits);
+}
+
+static void
+bitstream_put_se(bitstream *bs, int val)
+{
+    unsigned int new_val;
+
+    if (val <= 0)
+        new_val = -2 * val;
+    else
+        new_val = 2 * val - 1;
+
+    bitstream_put_ue(bs, new_val);
+}
+
+static void
+bitstream_byte_aligning(bitstream *bs, int bit)
+{
+    int bit_offset = (bs->bit_offset & 0x7);
+    int bit_left = 8 - bit_offset;
+    int new_val;
+
+    if (!bit_offset)
+        return;
+
+    assert(bit == 0 || bit == 1);
+
+    if (bit)
+        new_val = (1 << bit_left) - 1;
+    else
+        new_val = 0;
+
+    bitstream_put_ui(bs, new_val, bit_left);
+}
+
+static void 
+rbsp_trailing_bits(bitstream *bs)
+{
+    bitstream_put_ui(bs, 1, 1);
+    bitstream_byte_aligning(bs, 0);
+}
+
+static void nal_start_code_prefix(bitstream *bs)
+{
+    bitstream_put_ui(bs, 0x00000001, 32);
+}
+
+static void nal_header(bitstream *bs, int nal_ref_idc, int nal_unit_type)
+{
+    bitstream_put_ui(bs, 0, 1);                /* forbidden_zero_bit: 0 */
+    bitstream_put_ui(bs, nal_ref_idc, 2);
+    bitstream_put_ui(bs, nal_unit_type, 5);
+}
+
+static void sps_rbsp(bitstream *bs)
+{
+    int profile_idc = PROFILE_IDC_BASELINE;
+
+    if (h264_profile  == VAProfileH264High)
+        profile_idc = PROFILE_IDC_HIGH;
+    else if (h264_profile  == VAProfileH264Main)
+        profile_idc = PROFILE_IDC_MAIN;
+
+    bitstream_put_ui(bs, profile_idc, 8);               /* profile_idc */
+    bitstream_put_ui(bs, !!(constraint_set_flag & 1), 1);                         /* constraint_set0_flag */
+    bitstream_put_ui(bs, !!(constraint_set_flag & 2), 1);                         /* constraint_set1_flag */
+    bitstream_put_ui(bs, !!(constraint_set_flag & 4), 1);                         /* constraint_set2_flag */
+    bitstream_put_ui(bs, !!(constraint_set_flag & 8), 1);                         /* constraint_set3_flag */
+    bitstream_put_ui(bs, 0, 4);                         /* reserved_zero_4bits */
+    bitstream_put_ui(bs, seq_param.level_idc, 8);      /* level_idc */
+    bitstream_put_ue(bs, seq_param.seq_parameter_set_id);      /* seq_parameter_set_id */
+
+    if ( profile_idc == PROFILE_IDC_HIGH) {
+        bitstream_put_ue(bs, 1);        /* chroma_format_idc = 1, 4:2:0 */ 
+        bitstream_put_ue(bs, 0);        /* bit_depth_luma_minus8 */
+        bitstream_put_ue(bs, 0);        /* bit_depth_chroma_minus8 */
+        bitstream_put_ui(bs, 0, 1);     /* qpprime_y_zero_transform_bypass_flag */
+        bitstream_put_ui(bs, 0, 1);     /* seq_scaling_matrix_present_flag */
+    }
+
+    bitstream_put_ue(bs, seq_param.seq_fields.bits.log2_max_frame_num_minus4); /* log2_max_frame_num_minus4 */
+    bitstream_put_ue(bs, seq_param.seq_fields.bits.pic_order_cnt_type);        /* pic_order_cnt_type */
+
+    if (seq_param.seq_fields.bits.pic_order_cnt_type == 0)
+        bitstream_put_ue(bs, seq_param.seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4);     /* log2_max_pic_order_cnt_lsb_minus4 */
+    else {
+        assert(0);
+    }
+
+    bitstream_put_ue(bs, seq_param.max_num_ref_frames);        /* num_ref_frames */
+    bitstream_put_ui(bs, 0, 1);                                 /* gaps_in_frame_num_value_allowed_flag */
+
+    bitstream_put_ue(bs, seq_param.picture_width_in_mbs - 1);  /* pic_width_in_mbs_minus1 */
+    bitstream_put_ue(bs, seq_param.picture_height_in_mbs - 1); /* pic_height_in_map_units_minus1 */
+    bitstream_put_ui(bs, seq_param.seq_fields.bits.frame_mbs_only_flag, 1);    /* frame_mbs_only_flag */
+
+    if (!seq_param.seq_fields.bits.frame_mbs_only_flag) {
+        assert(0);
+    }
+
+    bitstream_put_ui(bs, seq_param.seq_fields.bits.direct_8x8_inference_flag, 1);      /* direct_8x8_inference_flag */
+    bitstream_put_ui(bs, seq_param.frame_cropping_flag, 1);            /* frame_cropping_flag */
+
+    if (seq_param.frame_cropping_flag) {
+        bitstream_put_ue(bs, seq_param.frame_crop_left_offset);        /* frame_crop_left_offset */
+        bitstream_put_ue(bs, seq_param.frame_crop_right_offset);       /* frame_crop_right_offset */
+        bitstream_put_ue(bs, seq_param.frame_crop_top_offset);         /* frame_crop_top_offset */
+        bitstream_put_ue(bs, seq_param.frame_crop_bottom_offset);      /* frame_crop_bottom_offset */
+    }
+    
+    //if ( frame_bit_rate < 0 ) { //TODO EW: the vui header isn't correct
+    if ( false ) {
+        bitstream_put_ui(bs, 0, 1); /* vui_parameters_present_flag */
+    } else {
+        bitstream_put_ui(bs, 1, 1); /* vui_parameters_present_flag */
+        bitstream_put_ui(bs, 0, 1); /* aspect_ratio_info_present_flag */
+        bitstream_put_ui(bs, 0, 1); /* overscan_info_present_flag */
+        bitstream_put_ui(bs, 0, 1); /* video_signal_type_present_flag */
+        bitstream_put_ui(bs, 0, 1); /* chroma_loc_info_present_flag */
+        bitstream_put_ui(bs, 1, 1); /* timing_info_present_flag */
+        {
+            bitstream_put_ui(bs, 1, 32);  // FPS
+            bitstream_put_ui(bs, frame_rate * 2, 32);  // FPS
+            bitstream_put_ui(bs, 1, 1);
+        }
+        bitstream_put_ui(bs, 1, 1); /* nal_hrd_parameters_present_flag */
+        {
+            // hrd_parameters 
+            bitstream_put_ue(bs, 0);    /* cpb_cnt_minus1 */
+            bitstream_put_ui(bs, 4, 4); /* bit_rate_scale */
+            bitstream_put_ui(bs, 6, 4); /* cpb_size_scale */
+           
+            bitstream_put_ue(bs, frame_bitrate - 1); /* bit_rate_value_minus1[0] */
+            bitstream_put_ue(bs, frame_bitrate*8 - 1); /* cpb_size_value_minus1[0] */
+            bitstream_put_ui(bs, 1, 1);  /* cbr_flag[0] */
+
+            bitstream_put_ui(bs, 23, 5);   /* initial_cpb_removal_delay_length_minus1 */
+            bitstream_put_ui(bs, 23, 5);   /* cpb_removal_delay_length_minus1 */
+            bitstream_put_ui(bs, 23, 5);   /* dpb_output_delay_length_minus1 */
+            bitstream_put_ui(bs, 23, 5);   /* time_offset_length  */
+        }
+        bitstream_put_ui(bs, 0, 1);   /* vcl_hrd_parameters_present_flag */
+        bitstream_put_ui(bs, 0, 1);   /* low_delay_hrd_flag */ 
+
+        bitstream_put_ui(bs, 0, 1); /* pic_struct_present_flag */
+        bitstream_put_ui(bs, 0, 1); /* bitstream_restriction_flag */
+    }
+
+    rbsp_trailing_bits(bs);     /* rbsp_trailing_bits */
+}
+
+
+static void pps_rbsp(bitstream *bs)
+{
+    bitstream_put_ue(bs, pic_param.pic_parameter_set_id);      /* pic_parameter_set_id */
+    bitstream_put_ue(bs, pic_param.seq_parameter_set_id);      /* seq_parameter_set_id */
+
+    bitstream_put_ui(bs, pic_param.pic_fields.bits.entropy_coding_mode_flag, 1);  /* entropy_coding_mode_flag */
+
+    bitstream_put_ui(bs, 0, 1);                         /* pic_order_present_flag: 0 */
+
+    bitstream_put_ue(bs, 0);                            /* num_slice_groups_minus1 */
+
+    bitstream_put_ue(bs, pic_param.num_ref_idx_l0_active_minus1);      /* num_ref_idx_l0_active_minus1 */
+    bitstream_put_ue(bs, pic_param.num_ref_idx_l1_active_minus1);      /* num_ref_idx_l1_active_minus1 1 */
+
+    bitstream_put_ui(bs, pic_param.pic_fields.bits.weighted_pred_flag, 1);     /* weighted_pred_flag: 0 */
+    bitstream_put_ui(bs, pic_param.pic_fields.bits.weighted_bipred_idc, 2);    /* weighted_bipred_idc: 0 */
+
+    bitstream_put_se(bs, pic_param.pic_init_qp - 26);  /* pic_init_qp_minus26 */
+    bitstream_put_se(bs, 0);                            /* pic_init_qs_minus26 */
+    bitstream_put_se(bs, 0);                            /* chroma_qp_index_offset */
+
+    bitstream_put_ui(bs, pic_param.pic_fields.bits.deblocking_filter_control_present_flag, 1); /* deblocking_filter_control_present_flag */
+    bitstream_put_ui(bs, 0, 1);                         /* constrained_intra_pred_flag */
+    bitstream_put_ui(bs, 0, 1);                         /* redundant_pic_cnt_present_flag */
+    
+    /* more_rbsp_data */
+    bitstream_put_ui(bs, pic_param.pic_fields.bits.transform_8x8_mode_flag, 1);    /*transform_8x8_mode_flag */
+    bitstream_put_ui(bs, 0, 1);                         /* pic_scaling_matrix_present_flag */
+    bitstream_put_se(bs, pic_param.second_chroma_qp_index_offset );    /*second_chroma_qp_index_offset */
+
+    rbsp_trailing_bits(bs);
+}
+
+static void slice_header(bitstream *bs)
+{
+    int first_mb_in_slice = slice_param.macroblock_address;
+
+    bitstream_put_ue(bs, first_mb_in_slice);        /* first_mb_in_slice: 0 */
+    bitstream_put_ue(bs, slice_param.slice_type);   /* slice_type */
+    bitstream_put_ue(bs, slice_param.pic_parameter_set_id);        /* pic_parameter_set_id: 0 */
+    bitstream_put_ui(bs, pic_param.frame_num, seq_param.seq_fields.bits.log2_max_frame_num_minus4 + 4); /* frame_num */
+
+    /* frame_mbs_only_flag == 1 */
+    if (!seq_param.seq_fields.bits.frame_mbs_only_flag) {
+        /* FIXME: */
+        assert(0);
+    }
+
+    if (pic_param.pic_fields.bits.idr_pic_flag)
+        bitstream_put_ue(bs, slice_param.idr_pic_id);          /* idr_pic_id: 0 */
+
+    if (seq_param.seq_fields.bits.pic_order_cnt_type == 0) {
+        bitstream_put_ui(bs, pic_param.CurrPic.TopFieldOrderCnt, seq_param.seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4 + 4);
+        /* pic_order_present_flag == 0 */
+    } else {
+        /* FIXME: */
+        assert(0);
+    }
+
+    /* redundant_pic_cnt_present_flag == 0 */
+    /* slice type */
+    if (IS_P_SLICE(slice_param.slice_type)) {
+        bitstream_put_ui(bs, slice_param.num_ref_idx_active_override_flag, 1);            /* num_ref_idx_active_override_flag: */
+
+        if (slice_param.num_ref_idx_active_override_flag)
+            bitstream_put_ue(bs, slice_param.num_ref_idx_l0_active_minus1);
+
+        /* ref_pic_list_reordering */
+        bitstream_put_ui(bs, 0, 1);            /* ref_pic_list_reordering_flag_l0: 0 */
+    } else if (IS_B_SLICE(slice_param.slice_type)) {
+        bitstream_put_ui(bs, slice_param.direct_spatial_mv_pred_flag, 1);            /* direct_spatial_mv_pred: 1 */
+
+        bitstream_put_ui(bs, slice_param.num_ref_idx_active_override_flag, 1);       /* num_ref_idx_active_override_flag: */
+
+        if (slice_param.num_ref_idx_active_override_flag) {
+            bitstream_put_ue(bs, slice_param.num_ref_idx_l0_active_minus1);
+            bitstream_put_ue(bs, slice_param.num_ref_idx_l1_active_minus1);
+        }
+
+        /* ref_pic_list_reordering */
+        bitstream_put_ui(bs, 0, 1);            /* ref_pic_list_reordering_flag_l0: 0 */
+        bitstream_put_ui(bs, 0, 1);            /* ref_pic_list_reordering_flag_l1: 0 */
+    }
+
+    if ((pic_param.pic_fields.bits.weighted_pred_flag &&
+         IS_P_SLICE(slice_param.slice_type)) ||
+        ((pic_param.pic_fields.bits.weighted_bipred_idc == 1) &&
+         IS_B_SLICE(slice_param.slice_type))) {
+        /* FIXME: fill weight/offset table */
+        assert(0);
+    }
+
+    /* dec_ref_pic_marking */
+    if (pic_param.pic_fields.bits.reference_pic_flag) {     /* nal_ref_idc != 0 */
+        unsigned char no_output_of_prior_pics_flag = 0;
+        unsigned char long_term_reference_flag = 0;
+        unsigned char adaptive_ref_pic_marking_mode_flag = 0;
+
+        if (pic_param.pic_fields.bits.idr_pic_flag) {
+            bitstream_put_ui(bs, no_output_of_prior_pics_flag, 1);            /* no_output_of_prior_pics_flag: 0 */
+            bitstream_put_ui(bs, long_term_reference_flag, 1);            /* long_term_reference_flag: 0 */
+        } else {
+            bitstream_put_ui(bs, adaptive_ref_pic_marking_mode_flag, 1);            /* adaptive_ref_pic_marking_mode_flag: 0 */
+        }
+    }
+
+    if (pic_param.pic_fields.bits.entropy_coding_mode_flag &&
+        !IS_I_SLICE(slice_param.slice_type))
+        bitstream_put_ue(bs, slice_param.cabac_init_idc);               /* cabac_init_idc: 0 */
+
+    bitstream_put_se(bs, slice_param.slice_qp_delta);                   /* slice_qp_delta: 0 */
+
+    /* ignore for SP/SI */
+
+    if (pic_param.pic_fields.bits.deblocking_filter_control_present_flag) {
+        bitstream_put_ue(bs, slice_param.disable_deblocking_filter_idc);           /* disable_deblocking_filter_idc: 0 */
+
+        if (slice_param.disable_deblocking_filter_idc != 1) {
+            bitstream_put_se(bs, slice_param.slice_alpha_c0_offset_div2);          /* slice_alpha_c0_offset_div2: 2 */
+            bitstream_put_se(bs, slice_param.slice_beta_offset_div2);              /* slice_beta_offset_div2: 2 */
+        }
+    }
+
+    if (pic_param.pic_fields.bits.entropy_coding_mode_flag) {
+        bitstream_byte_aligning(bs, 1);
+    }
+}
+
+static int
+build_packed_pic_buffer(unsigned char **header_buffer)
+{
+    bitstream bs;
+
+    bitstream_start(&bs);
+    nal_start_code_prefix(&bs);
+    nal_header(&bs, NAL_REF_IDC_HIGH, NAL_PPS);
+    pps_rbsp(&bs);
+    bitstream_end(&bs);
+
+    *header_buffer = (unsigned char *)bs.buffer;
+    return bs.bit_offset;
+}
+
+static int
+build_packed_seq_buffer(unsigned char **header_buffer)
+{
+    bitstream bs;
+
+    bitstream_start(&bs);
+    nal_start_code_prefix(&bs);
+    nal_header(&bs, NAL_REF_IDC_HIGH, NAL_SPS);
+    sps_rbsp(&bs);
+    bitstream_end(&bs);
+
+    *header_buffer = (unsigned char *)bs.buffer;
+    return bs.bit_offset;
+}
+
+static int build_packed_slice_buffer(unsigned char **header_buffer)
+{
+    bitstream bs;
+    int is_idr = !!pic_param.pic_fields.bits.idr_pic_flag;
+    int is_ref = !!pic_param.pic_fields.bits.reference_pic_flag;
+
+    bitstream_start(&bs);
+    nal_start_code_prefix(&bs);
+
+    if (IS_I_SLICE(slice_param.slice_type)) {
+        nal_header(&bs, NAL_REF_IDC_HIGH, is_idr ? NAL_IDR : NAL_NON_IDR);
+    } else if (IS_P_SLICE(slice_param.slice_type)) {
+        nal_header(&bs, NAL_REF_IDC_MEDIUM, NAL_NON_IDR);
+    } else {
+        assert(IS_B_SLICE(slice_param.slice_type));
+        nal_header(&bs, is_ref ? NAL_REF_IDC_LOW : NAL_REF_IDC_NONE, NAL_NON_IDR);
+    }
+
+    slice_header(&bs);
+    bitstream_end(&bs);
+
+    *header_buffer = (unsigned char *)bs.buffer;
+    return bs.bit_offset;
+}
+
+
+/*
+  Assume frame sequence is: Frame#0, #1, #2, ..., #M, ..., #X, ... (encoding order)
+  1) period between Frame #X and Frame #N = #X - #N
+  2) 0 means infinite for intra_period/intra_idr_period, and 0 is invalid for ip_period
+  3) intra_idr_period % intra_period (intra_period > 0) and intra_period % ip_period must be 0
+  4) intra_period and intra_idr_period take precedence over ip_period
+  5) if ip_period > 1, intra_period and intra_idr_period are not  the strict periods 
+     of I/IDR frames, see bellow examples
+  -------------------------------------------------------------------
+  intra_period intra_idr_period ip_period frame sequence (intra_period/intra_idr_period/ip_period)
+  0            ignored          1          IDRPPPPPPP ...     (No IDR/I any more)
+  0            ignored        >=2          IDR(PBB)(PBB)...   (No IDR/I any more)
+  1            0                ignored    IDRIIIIIII...      (No IDR any more)
+  1            1                ignored    IDR IDR IDR IDR...
+  1            >=2              ignored    IDRII IDRII IDR... (1/3/ignore)
+  >=2          0                1          IDRPPP IPPP I...   (3/0/1)
+  >=2          0              >=2          IDR(PBB)(PBB)(IBB) (6/0/3)
+                                              (PBB)(IBB)(PBB)(IBB)... 
+  >=2          >=2              1          IDRPPPPP IPPPPP IPPPPP (6/18/1)
+                                           IDRPPPPP IPPPPP IPPPPP...
+  >=2          >=2              >=2        {IDR(PBB)(PBB)(IBB)(PBB)(IBB)(PBB)} (6/18/3)
+                                           {IDR(PBB)(PBB)(IBB)(PBB)(IBB)(PBB)}...
+                                           {IDR(PBB)(PBB)(IBB)(PBB)}           (6/12/3)
+                                           {IDR(PBB)(PBB)(IBB)(PBB)}...
+                                           {IDR(PBB)(PBB)}                     (6/6/3)
+                                           {IDR(PBB)(PBB)}.
+*/
+
+/*
+ * Return displaying order with specified periods and encoding order
+ * displaying_order: displaying order
+ * frame_type: frame type 
+ */
+#define FRAME_P 0
+#define FRAME_B 1
+#define FRAME_I 2
+#define FRAME_IDR 7
+void encoding2display_order(
+    unsigned long long encoding_order, int intra_period,
+    int intra_idr_period, int ip_period,
+    unsigned long long *displaying_order,
+    int *frame_type)
+{
+    int encoding_order_gop = 0;
+
+    if (intra_period == 1) { /* all are I/IDR frames */
+        *displaying_order = encoding_order;
+        if (intra_idr_period == 0)
+            *frame_type = (encoding_order == 0)?FRAME_IDR:FRAME_I;
+        else
+            *frame_type = (encoding_order % intra_idr_period == 0)?FRAME_IDR:FRAME_I;
+        return;
+    }
+
+    if (intra_period == 0)
+        intra_idr_period = 0;
+
+    /* new sequence like
+     * IDR PPPPP IPPPPP
+     * IDR (PBB)(PBB)(IBB)(PBB)
+     */
+    encoding_order_gop = (intra_idr_period == 0)? encoding_order:
+        (encoding_order % (intra_idr_period + ((ip_period == 1)?0:1)));
+         
+    if (encoding_order_gop == 0) { /* the first frame */
+        *frame_type = FRAME_IDR;
+        *displaying_order = encoding_order;
+    } else if (((encoding_order_gop - 1) % ip_period) != 0) { /* B frames */
+       *frame_type = FRAME_B;
+        *displaying_order = encoding_order - 1;
+    } else if ((intra_period != 0) && /* have I frames */
+               (encoding_order_gop >= 2) &&
+               ((ip_period == 1 && encoding_order_gop % intra_period == 0) || /* for IDR PPPPP IPPPP */
+                /* for IDR (PBB)(PBB)(IBB) */
+                (ip_period >= 2 && ((encoding_order_gop - 1) / ip_period % (intra_period / ip_period)) == 0))) {
+       *frame_type = FRAME_I;
+       *displaying_order = encoding_order + ip_period - 1;
+    } else {
+       *frame_type = FRAME_P;
+       *displaying_order = encoding_order + ip_period - 1;
+    }
+}
+
+
+static const char *rc_to_string(int rcmode)
+{
+    switch (rc_mode) {
+    case VA_RC_NONE:
+        return "NONE";
+    case VA_RC_CBR:
+        return "CBR";
+    case VA_RC_VBR:
+        return "VBR";
+    case VA_RC_VCM:
+        return "VCM";
+    case VA_RC_CQP:
+        return "CQP";
+    case VA_RC_VBR_CONSTRAINED:
+        return "VBR_CONSTRAINED";
+    default:
+        return "Unknown";
+    }
+}
+
+#if 0
+static int process_cmdline(int argc, char *argv[])
+{
+    char c;
+    const struct option long_opts[] = {
+        {"help", no_argument, NULL, 0 },
+        {"bitrate", required_argument, NULL, 1 },
+        {"minqp", required_argument, NULL, 2 },
+        {"initialqp", required_argument, NULL, 3 },
+        {"intra_period", required_argument, NULL, 4 },
+        {"idr_period", required_argument, NULL, 5 },
+        {"ip_period", required_argument, NULL, 6 },
+        {"rcmode", required_argument, NULL, 7 },
+        {"srcyuv", required_argument, NULL, 9 },
+        {"recyuv", required_argument, NULL, 10 },
+        {"fourcc", required_argument, NULL, 11 },
+        {"syncmode", no_argument, NULL, 12 },
+        {"enablePSNR", no_argument, NULL, 13 },
+        {"prit", required_argument, NULL, 14 },
+        {"priv", required_argument, NULL, 15 },
+        {"framecount", required_argument, NULL, 16 },
+        {"entropy", required_argument, NULL, 17 },
+        {"profile", required_argument, NULL, 18 },
+        {NULL, no_argument, NULL, 0 }};
+    int long_index;
+    
+    while ((c =getopt_long_only(argc, argv, "w:h:n:f:o:?", long_opts, &long_index)) != EOF) {
+        switch (c) {
+        case 'w':
+            frame_width = atoi(optarg);
+            break;
+        case 'h':
+            frame_height = atoi(optarg);
+            break;
+        case 'n':
+        case 'f':
+            frame_rate = atoi(optarg);
+            break;
+        case 'o':
+            coded_fn = strdup(optarg);
+            break;
+        case 0:
+            print_help();
+            exit(0);
+        case 1:
+            frame_bitrate = atoi(optarg);
+            break;
+        case 2:
+            minimal_qp = atoi(optarg);
+            break;
+        case 3:
+            initial_qp = atoi(optarg);
+            break;
+        case 4:
+            intra_period = atoi(optarg);
+            break;
+        case 5:
+            intra_idr_period = atoi(optarg);
+            break;
+        case 6:
+            ip_period = atoi(optarg);
+            break;
+        case 7:
+            rc_mode = string_to_rc(optarg);
+            if (rc_mode < 0) {
+                print_help();
+                exit(1);
+            }
+            break;
+        case 9:
+            srcyuv_fn = strdup(optarg);
+            break;
+        case 11:
+            srcyuv_fourcc = string_to_fourcc(optarg);
+            if (srcyuv_fourcc <= 0) {
+                print_help();
+                exit(1);
+            }
+            break;
+        case 13:
+            calc_psnr = 1;
+            break;
+        case 14:
+            misc_priv_type = strtol(optarg, NULL, 0);
+            break;
+        case 15:
+            misc_priv_value = strtol(optarg, NULL, 0);
+            break;
+        case 17:
+            h264_entropy_mode = atoi(optarg) ? 1: 0;
+            break;
+        case 18:
+            if (strncmp(optarg, "BP", 2) == 0)
+                h264_profile = VAProfileH264Baseline;
+            else if (strncmp(optarg, "MP", 2) == 0)
+                h264_profile = VAProfileH264Main;
+            else if (strncmp(optarg, "HP", 2) == 0)
+                h264_profile = VAProfileH264High;
+            else
+                h264_profile = (VAProfile)0;
+            break;
+        case ':':
+        case '?':
+            print_help();
+            exit(0);
+        }
+    }
+
+    if (ip_period < 1) {
+       printf(" ip_period must be greater than 0\n");
+        exit(0);
+    }
+    if (intra_period != 1 && intra_period % ip_period != 0) {
+       printf(" intra_period must be a multiplier of ip_period\n");
+        exit(0);        
+    }
+    if (intra_period != 0 && intra_idr_period % intra_period != 0) {
+       printf(" intra_idr_period must be a multiplier of intra_period\n");
+        exit(0);        
+    }
+
+    if (frame_bitrate == 0)
+        frame_bitrate = frame_width * frame_height * 12 * frame_rate / 50;
+        
+    if (coded_fn == NULL) {
+        struct stat buf;
+        if (stat("/tmp", &buf) == 0)
+            coded_fn = strdup("/tmp/test.264");
+        else if (stat("/sdcard", &buf) == 0)
+            coded_fn = strdup("/sdcard/test.264");
+        else
+            coded_fn = strdup("./test.264");
+    }
+    
+    /* store coded data into a file */
+    coded_fp = fopen(coded_fn, "w+");
+    if (coded_fp == NULL) {
+        printf("Open file %s failed, exit\n", coded_fn);
+        exit(1);
+    }
+
+    frame_width_mbaligned = (frame_width + 15) & (~15);
+    frame_height_mbaligned = (frame_height + 15) & (~15);
+    if (frame_width != frame_width_mbaligned ||
+        frame_height != frame_height_mbaligned) {
+        printf("Source frame is %dx%d and will code clip to %dx%d with crop\n",
+               frame_width, frame_height,
+               frame_width_mbaligned, frame_height_mbaligned
+               );
+    }
+    
+    return 0;
+}
+#endif
+
+static Display *x11_display;
+static Window   x11_window;
+
+VADisplay
+va_open_display(void)
+{
+    x11_display = XOpenDisplay(NULL);
+    if (!x11_display) {
+        fprintf(stderr, "error: can't connect to X server!\n");
+        return NULL;
+    }
+    return vaGetDisplay(x11_display);
+}
+
+void
+va_close_display(VADisplay va_dpy)
+{
+    if (!x11_display)
+        return;
+
+    if (x11_window) {
+        XUnmapWindow(x11_display, x11_window);
+        XDestroyWindow(x11_display, x11_window);
+        x11_window = None;
+    }
+    XCloseDisplay(x11_display);
+    x11_display = NULL;
+}
+
+static int init_va(void)
+{
+    VAProfile profile_list[]={VAProfileH264High, VAProfileH264Main, VAProfileH264Baseline, VAProfileH264ConstrainedBaseline};
+    VAEntrypoint *entrypoints;
+    int num_entrypoints, slice_entrypoint;
+    int support_encode = 0;    
+    int major_ver, minor_ver;
+    VAStatus va_status;
+    unsigned int i;
+
+    va_dpy = va_open_display();
+    va_status = vaInitialize(va_dpy, &major_ver, &minor_ver);
+    CHECK_VASTATUS(va_status, "vaInitialize");
+
+    num_entrypoints = vaMaxNumEntrypoints(va_dpy);
+    entrypoints = (VAEntrypoint *)malloc(num_entrypoints * sizeof(*entrypoints));
+    if (!entrypoints) {
+        fprintf(stderr, "error: failed to initialize VA entrypoints array\n");
+        exit(1);
+    }
+
+    /* use the highest profile */
+    for (i = 0; i < sizeof(profile_list)/sizeof(profile_list[0]); i++) {
+        if ((h264_profile != ~0) && h264_profile != profile_list[i])
+            continue;
+        
+        h264_profile = profile_list[i];
+        vaQueryConfigEntrypoints(va_dpy, h264_profile, entrypoints, &num_entrypoints);
+        for (slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+            if (entrypoints[slice_entrypoint] == VAEntrypointEncSlice) {
+                support_encode = 1;
+                break;
+            }
+        }
+        if (support_encode == 1)
+            break;
+    }
+    
+    if (support_encode == 0) {
+        printf("Can't find VAEntrypointEncSlice for H264 profiles\n");
+        exit(1);
+    } else {
+        switch (h264_profile) {
+            case VAProfileH264Baseline:
+                printf("Use profile VAProfileH264Baseline\n");
+                ip_period = 1;
+                constraint_set_flag |= (1 << 0); /* Annex A.2.1 */
+                h264_entropy_mode = 0;
+                break;
+            case VAProfileH264ConstrainedBaseline:
+                printf("Use profile VAProfileH264ConstrainedBaseline\n");
+                constraint_set_flag |= (1 << 0 | 1 << 1); /* Annex A.2.2 */
+                ip_period = 1;
+                break;
+
+            case VAProfileH264Main:
+                printf("Use profile VAProfileH264Main\n");
+                constraint_set_flag |= (1 << 1); /* Annex A.2.2 */
+                break;
+
+            case VAProfileH264High:
+                constraint_set_flag |= (1 << 3); /* Annex A.2.4 */
+                printf("Use profile VAProfileH264High\n");
+                break;
+            default:
+                printf("unknow profile. Set to Baseline");
+                h264_profile = VAProfileH264Baseline;
+                ip_period = 1;
+                constraint_set_flag |= (1 << 0); /* Annex A.2.1 */
+                break;
+        }
+    }
+
+    VAConfigAttrib attrib[VAConfigAttribTypeMax];
+
+    /* find out the format for the render target, and rate control mode */
+    for (i = 0; i < VAConfigAttribTypeMax; i++)
+        attrib[i].type = (VAConfigAttribType)i;
+
+    va_status = vaGetConfigAttributes(va_dpy, h264_profile, VAEntrypointEncSlice,
+                                      &attrib[0], VAConfigAttribTypeMax);
+    CHECK_VASTATUS(va_status, "vaGetConfigAttributes");
+    /* check the interested configattrib */
+    if ((attrib[VAConfigAttribRTFormat].value & VA_RT_FORMAT_YUV420) == 0) {
+        printf("Not find desired YUV420 RT format\n");
+        exit(1);
+    } else {
+        config_attrib[config_attrib_num].type = VAConfigAttribRTFormat;
+        config_attrib[config_attrib_num].value = VA_RT_FORMAT_YUV420;
+        config_attrib_num++;
+    }
+    
+    if (attrib[VAConfigAttribRateControl].value != VA_ATTRIB_NOT_SUPPORTED) {
+        int tmp = attrib[VAConfigAttribRateControl].value;
+
+        printf("Support rate control mode (0x%x):", tmp);
+        
+        if (tmp & VA_RC_NONE)
+            printf("NONE ");
+        if (tmp & VA_RC_CBR)
+            printf("CBR ");
+        if (tmp & VA_RC_VBR)
+            printf("VBR ");
+        if (tmp & VA_RC_VCM)
+            printf("VCM ");
+        if (tmp & VA_RC_CQP)
+            printf("CQP ");
+        if (tmp & VA_RC_VBR_CONSTRAINED)
+            printf("VBR_CONSTRAINED ");
+
+        printf("\n");
+
+        if (rc_mode == -1 || !(rc_mode & tmp))  {
+            if (rc_mode != -1) {
+                printf("Warning: Don't support the specified RateControl mode: %s!!!, switch to ", rc_to_string(rc_mode));
+            }
+
+            for (i = 0; i < sizeof(rc_default_modes) / sizeof(rc_default_modes[0]); i++) {
+                if (rc_default_modes[i] & tmp) {
+                    rc_mode = rc_default_modes[i];
+                    break;
+                }
+            }
+
+            printf("RateControl mode: %s\n", rc_to_string(rc_mode));
+        }
+
+        config_attrib[config_attrib_num].type = VAConfigAttribRateControl;
+        config_attrib[config_attrib_num].value = rc_mode;
+        config_attrib_num++;
+    }
+    
+
+    if (attrib[VAConfigAttribEncPackedHeaders].value != VA_ATTRIB_NOT_SUPPORTED) {
+        int tmp = attrib[VAConfigAttribEncPackedHeaders].value;
+
+        printf("Support VAConfigAttribEncPackedHeaders\n");
+        
+        h264_packedheader = 1;
+        config_attrib[config_attrib_num].type = VAConfigAttribEncPackedHeaders;
+        config_attrib[config_attrib_num].value = VA_ENC_PACKED_HEADER_NONE;
+        
+        if (tmp & VA_ENC_PACKED_HEADER_SEQUENCE) {
+            printf("Support packed sequence headers\n");
+            config_attrib[config_attrib_num].value |= VA_ENC_PACKED_HEADER_SEQUENCE;
+        }
+        
+        if (tmp & VA_ENC_PACKED_HEADER_PICTURE) {
+            printf("Support packed picture headers\n");
+            config_attrib[config_attrib_num].value |= VA_ENC_PACKED_HEADER_PICTURE;
+        }
+        
+        if (tmp & VA_ENC_PACKED_HEADER_SLICE) {
+            printf("Support packed slice headers\n");
+            config_attrib[config_attrib_num].value |= VA_ENC_PACKED_HEADER_SLICE;
+        }
+        
+        if (tmp & VA_ENC_PACKED_HEADER_MISC) {
+            printf("Support packed misc headers\n");
+            config_attrib[config_attrib_num].value |= VA_ENC_PACKED_HEADER_MISC;
+        }
+        
+        enc_packed_header_idx = config_attrib_num;
+        config_attrib_num++;
+    }
+
+    if (attrib[VAConfigAttribEncInterlaced].value != VA_ATTRIB_NOT_SUPPORTED) {
+        int tmp = attrib[VAConfigAttribEncInterlaced].value;
+        
+        printf("Support VAConfigAttribEncInterlaced\n");
+
+        if (tmp & VA_ENC_INTERLACED_FRAME)
+            printf("support VA_ENC_INTERLACED_FRAME\n");
+        if (tmp & VA_ENC_INTERLACED_FIELD)
+            printf("Support VA_ENC_INTERLACED_FIELD\n");
+        if (tmp & VA_ENC_INTERLACED_MBAFF)
+            printf("Support VA_ENC_INTERLACED_MBAFF\n");
+        if (tmp & VA_ENC_INTERLACED_PAFF)
+            printf("Support VA_ENC_INTERLACED_PAFF\n");
+        
+        config_attrib[config_attrib_num].type = VAConfigAttribEncInterlaced;
+        config_attrib[config_attrib_num].value = VA_ENC_PACKED_HEADER_NONE;
+        config_attrib_num++;
+    }
+    
+    if (attrib[VAConfigAttribEncMaxRefFrames].value != VA_ATTRIB_NOT_SUPPORTED) {
+        h264_maxref = attrib[VAConfigAttribEncMaxRefFrames].value;
+        
+        printf("Support %d RefPicList0 and %d RefPicList1\n",
+               h264_maxref & 0xffff, (h264_maxref >> 16) & 0xffff );
+    }
+
+    if (attrib[VAConfigAttribEncMaxSlices].value != VA_ATTRIB_NOT_SUPPORTED)
+        printf("Support %d slices\n", attrib[VAConfigAttribEncMaxSlices].value);
+
+    if (attrib[VAConfigAttribEncSliceStructure].value != VA_ATTRIB_NOT_SUPPORTED) {
+        int tmp = attrib[VAConfigAttribEncSliceStructure].value;
+        
+        printf("Support VAConfigAttribEncSliceStructure\n");
+
+        if (tmp & VA_ENC_SLICE_STRUCTURE_ARBITRARY_ROWS)
+            printf("Support VA_ENC_SLICE_STRUCTURE_ARBITRARY_ROWS\n");
+        if (tmp & VA_ENC_SLICE_STRUCTURE_POWER_OF_TWO_ROWS)
+            printf("Support VA_ENC_SLICE_STRUCTURE_POWER_OF_TWO_ROWS\n");
+        if (tmp & VA_ENC_SLICE_STRUCTURE_ARBITRARY_MACROBLOCKS)
+            printf("Support VA_ENC_SLICE_STRUCTURE_ARBITRARY_MACROBLOCKS\n");
+    }
+    if (attrib[VAConfigAttribEncMacroblockInfo].value != VA_ATTRIB_NOT_SUPPORTED) {
+        printf("Support VAConfigAttribEncMacroblockInfo\n");
+    }
+
+    free(entrypoints);
+    return 0;
+}
+
+static int setup_encode()
+{
+    VAStatus va_status;
+    VASurfaceID *tmp_surfaceid;
+    int codedbuf_size, i;
+    static VASurfaceID src_surface[SURFACE_NUM];
+    static VASurfaceID ref_surface[SURFACE_NUM];
+    
+    va_status = vaCreateConfig(va_dpy, h264_profile, VAEntrypointEncSlice,
+            &config_attrib[0], config_attrib_num, &config_id);
+    CHECK_VASTATUS(va_status, "vaCreateConfig");
+
+    /* create source surfaces */
+    va_status = vaCreateSurfaces(va_dpy,
+                                 VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
+                                 &src_surface[0], SURFACE_NUM,
+                                 NULL, 0);
+    CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+    /* create reference surfaces */
+    va_status = vaCreateSurfaces(va_dpy,
+                                 VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
+                                &ref_surface[0], SURFACE_NUM,
+                                NULL, 0);
+    CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+    tmp_surfaceid = (VASurfaceID *)calloc(2 * SURFACE_NUM, sizeof(VASurfaceID));
+    memcpy(tmp_surfaceid, src_surface, SURFACE_NUM * sizeof(VASurfaceID));
+    memcpy(tmp_surfaceid + SURFACE_NUM, ref_surface, SURFACE_NUM * sizeof(VASurfaceID));
+    
+    /* Create a context for this encode pipe */
+    va_status = vaCreateContext(va_dpy, config_id,
+                                frame_width_mbaligned, frame_height_mbaligned,
+                                VA_PROGRESSIVE,
+                                tmp_surfaceid, 2 * SURFACE_NUM,
+                                &context_id);
+    CHECK_VASTATUS(va_status, "vaCreateContext");
+    free(tmp_surfaceid);
+
+    codedbuf_size = (frame_width_mbaligned * frame_height_mbaligned * 400) / (16*16);
+
+    for (i = 0; i < SURFACE_NUM; i++) {
+        /* create coded buffer once for all
+         * other VA buffers which won't be used again after vaRenderPicture.
+         * so APP can always vaCreateBuffer for every frame
+         * but coded buffer need to be mapped and accessed after vaRenderPicture/vaEndPicture
+         * so VA won't maintain the coded buffer
+         */
+        va_status = vaCreateBuffer(va_dpy, context_id, VAEncCodedBufferType,
+                codedbuf_size, 1, NULL, &gl_surfaces[i].coded_buf);
+        CHECK_VASTATUS(va_status, "vaCreateBuffer");
+    }
+
+    /* create OpenGL objects */
+    //glGenFramebuffers(SURFACE_NUM, fbos);
+    
+    for (i = 0; i < SURFACE_NUM; i++) {
+        glGenTextures(1, &gl_surfaces[i].y_tex);
+        glGenTextures(1, &gl_surfaces[i].cbcr_tex);
+    }
+
+    for (i = 0; i < SURFACE_NUM; i++) {
+        gl_surfaces[i].src_surface = src_surface[i];
+        gl_surfaces[i].ref_surface = ref_surface[i];
+    }
+    
+    return 0;
+}
+
+
+
+#define partition(ref, field, key, ascending)   \
+    while (i <= j) {                            \
+        if (ascending) {                        \
+            while (ref[i].field < key)          \
+                i++;                            \
+            while (ref[j].field > key)          \
+                j--;                            \
+        } else {                                \
+            while (ref[i].field > key)          \
+                i++;                            \
+            while (ref[j].field < key)          \
+                j--;                            \
+        }                                       \
+        if (i <= j) {                           \
+            tmp = ref[i];                       \
+            ref[i] = ref[j];                    \
+            ref[j] = tmp;                       \
+            i++;                                \
+            j--;                                \
+        }                                       \
+    }                                           \
+
+static void sort_one(VAPictureH264 ref[], int left, int right,
+                     int ascending, int frame_idx)
+{
+    int i = left, j = right;
+    unsigned int key;
+    VAPictureH264 tmp;
+
+    if (frame_idx) {
+        key = ref[(left + right) / 2].frame_idx;
+        partition(ref, frame_idx, key, ascending);
+    } else {
+        key = ref[(left + right) / 2].TopFieldOrderCnt;
+        partition(ref, TopFieldOrderCnt, (signed int)key, ascending);
+    }
+    
+    /* recursion */
+    if (left < j)
+        sort_one(ref, left, j, ascending, frame_idx);
+    
+    if (i < right)
+        sort_one(ref, i, right, ascending, frame_idx);
+}
+
+static void sort_two(VAPictureH264 ref[], int left, int right, unsigned int key, unsigned int frame_idx,
+                     int partition_ascending, int list0_ascending, int list1_ascending)
+{
+    int i = left, j = right;
+    VAPictureH264 tmp;
+
+    if (frame_idx) {
+        partition(ref, frame_idx, key, partition_ascending);
+    } else {
+        partition(ref, TopFieldOrderCnt, (signed int)key, partition_ascending);
+    }
+    
+
+    sort_one(ref, left, i-1, list0_ascending, frame_idx);
+    sort_one(ref, j+1, right, list1_ascending, frame_idx);
+}
+
+static int update_ReferenceFrames(void)
+{
+    int i;
+    
+    if (current_frame_type == FRAME_B)
+        return 0;
+
+    CurrentCurrPic.flags = VA_PICTURE_H264_SHORT_TERM_REFERENCE;
+    numShortTerm++;
+    if (numShortTerm > num_ref_frames)
+        numShortTerm = num_ref_frames;
+    for (i=numShortTerm-1; i>0; i--)
+        ReferenceFrames[i] = ReferenceFrames[i-1];
+    ReferenceFrames[0] = CurrentCurrPic;
+    
+    if (current_frame_type != FRAME_B)
+        current_frame_num++;
+    if (current_frame_num > MaxFrameNum)
+        current_frame_num = 0;
+    
+    return 0;
+}
+
+
+static int update_RefPicList(void)
+{
+    unsigned int current_poc = CurrentCurrPic.TopFieldOrderCnt;
+    
+    if (current_frame_type == FRAME_P) {
+        memcpy(RefPicList0_P, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
+        sort_one(RefPicList0_P, 0, numShortTerm-1, 0, 1);
+    }
+    
+    if (current_frame_type == FRAME_B) {
+        memcpy(RefPicList0_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
+        sort_two(RefPicList0_B, 0, numShortTerm-1, current_poc, 0,
+                 1, 0, 1);
+
+        memcpy(RefPicList1_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
+        sort_two(RefPicList1_B, 0, numShortTerm-1, current_poc, 0,
+                 0, 1, 0);
+    }
+    
+    return 0;
+}
+
+
+static int render_sequence(void)
+{
+    VABufferID seq_param_buf, rc_param_buf, misc_param_tmpbuf, render_id[2];
+    VAStatus va_status;
+    VAEncMiscParameterBuffer *misc_param, *misc_param_tmp;
+    VAEncMiscParameterRateControl *misc_rate_ctrl;
+    
+    seq_param.level_idc = 41 /*SH_LEVEL_3*/;
+    seq_param.picture_width_in_mbs = frame_width_mbaligned / 16;
+    seq_param.picture_height_in_mbs = frame_height_mbaligned / 16;
+    seq_param.bits_per_second = frame_bitrate;
+
+    seq_param.intra_period = intra_period;
+    seq_param.intra_idr_period = intra_idr_period;
+    seq_param.ip_period = ip_period;
+
+    seq_param.max_num_ref_frames = num_ref_frames;
+    seq_param.seq_fields.bits.frame_mbs_only_flag = 1;
+    seq_param.time_scale = frame_rate * 2;
+    seq_param.num_units_in_tick = 1; /* Tc = num_units_in_tick / scale */
+    seq_param.seq_fields.bits.log2_max_pic_order_cnt_lsb_minus4 = Log2MaxPicOrderCntLsb - 4;
+    seq_param.seq_fields.bits.log2_max_frame_num_minus4 = Log2MaxFrameNum - 4;;
+    seq_param.seq_fields.bits.frame_mbs_only_flag = 1;
+    seq_param.seq_fields.bits.chroma_format_idc = 1;
+    seq_param.seq_fields.bits.direct_8x8_inference_flag = 1;
+    
+    if (frame_width != frame_width_mbaligned ||
+        frame_height != frame_height_mbaligned) {
+        seq_param.frame_cropping_flag = 1;
+        seq_param.frame_crop_left_offset = 0;
+        seq_param.frame_crop_right_offset = (frame_width_mbaligned - frame_width)/2;
+        seq_param.frame_crop_top_offset = 0;
+        seq_param.frame_crop_bottom_offset = (frame_height_mbaligned - frame_height)/2;
+    }
+    
+    va_status = vaCreateBuffer(va_dpy, context_id,
+                               VAEncSequenceParameterBufferType,
+                               sizeof(seq_param), 1, &seq_param, &seq_param_buf);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+    
+    va_status = vaCreateBuffer(va_dpy, context_id,
+                               VAEncMiscParameterBufferType,
+                               sizeof(VAEncMiscParameterBuffer) + sizeof(VAEncMiscParameterRateControl),
+                               1, NULL, &rc_param_buf);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+    
+    vaMapBuffer(va_dpy, rc_param_buf, (void **)&misc_param);
+    misc_param->type = VAEncMiscParameterTypeRateControl;
+    misc_rate_ctrl = (VAEncMiscParameterRateControl *)misc_param->data;
+    memset(misc_rate_ctrl, 0, sizeof(*misc_rate_ctrl));
+    misc_rate_ctrl->bits_per_second = frame_bitrate;
+    misc_rate_ctrl->target_percentage = 66;
+    misc_rate_ctrl->window_size = 1000;
+    misc_rate_ctrl->initial_qp = initial_qp;
+    misc_rate_ctrl->min_qp = minimal_qp;
+    misc_rate_ctrl->basic_unit_size = 0;
+    vaUnmapBuffer(va_dpy, rc_param_buf);
+
+    render_id[0] = seq_param_buf;
+    render_id[1] = rc_param_buf;
+    
+    va_status = vaRenderPicture(va_dpy, context_id, &render_id[0], 2);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");;
+
+    if (misc_priv_type != 0) {
+        va_status = vaCreateBuffer(va_dpy, context_id,
+                                   VAEncMiscParameterBufferType,
+                                   sizeof(VAEncMiscParameterBuffer),
+                                   1, NULL, &misc_param_tmpbuf);
+        CHECK_VASTATUS(va_status, "vaCreateBuffer");
+        vaMapBuffer(va_dpy, misc_param_tmpbuf, (void **)&misc_param_tmp);
+        misc_param_tmp->type = (VAEncMiscParameterType)misc_priv_type;
+        misc_param_tmp->data[0] = misc_priv_value;
+        vaUnmapBuffer(va_dpy, misc_param_tmpbuf);
+    
+        va_status = vaRenderPicture(va_dpy, context_id, &misc_param_tmpbuf, 1);
+    }
+    
+    return 0;
+}
+
+static int calc_poc(int pic_order_cnt_lsb)
+{
+    static int PicOrderCntMsb_ref = 0, pic_order_cnt_lsb_ref = 0;
+    int prevPicOrderCntMsb, prevPicOrderCntLsb;
+    int PicOrderCntMsb, TopFieldOrderCnt;
+    
+    if (current_frame_type == FRAME_IDR)
+        prevPicOrderCntMsb = prevPicOrderCntLsb = 0;
+    else {
+        prevPicOrderCntMsb = PicOrderCntMsb_ref;
+        prevPicOrderCntLsb = pic_order_cnt_lsb_ref;
+    }
+    
+    if ((pic_order_cnt_lsb < prevPicOrderCntLsb) &&
+        ((prevPicOrderCntLsb - pic_order_cnt_lsb) >= (int)(MaxPicOrderCntLsb / 2)))
+        PicOrderCntMsb = prevPicOrderCntMsb + MaxPicOrderCntLsb;
+    else if ((pic_order_cnt_lsb > prevPicOrderCntLsb) &&
+             ((pic_order_cnt_lsb - prevPicOrderCntLsb) > (int)(MaxPicOrderCntLsb / 2)))
+        PicOrderCntMsb = prevPicOrderCntMsb - MaxPicOrderCntLsb;
+    else
+        PicOrderCntMsb = prevPicOrderCntMsb;
+    
+    TopFieldOrderCnt = PicOrderCntMsb + pic_order_cnt_lsb;
+
+    if (current_frame_type != FRAME_B) {
+        PicOrderCntMsb_ref = PicOrderCntMsb;
+        pic_order_cnt_lsb_ref = pic_order_cnt_lsb;
+    }
+    
+    return TopFieldOrderCnt;
+}
+
+static int render_picture(void)
+{
+    VABufferID pic_param_buf;
+    VAStatus va_status;
+    int i = 0;
+
+    pic_param.CurrPic.picture_id = gl_surfaces[current_frame_display % SURFACE_NUM].ref_surface;
+    pic_param.CurrPic.frame_idx = current_frame_num;
+    pic_param.CurrPic.flags = 0;
+    pic_param.CurrPic.TopFieldOrderCnt = calc_poc((current_frame_display - current_IDR_display) % MaxPicOrderCntLsb);
+    pic_param.CurrPic.BottomFieldOrderCnt = pic_param.CurrPic.TopFieldOrderCnt;
+    CurrentCurrPic = pic_param.CurrPic;
+
+    if (getenv("TO_DEL")) { /* set RefPicList into ReferenceFrames */
+        update_RefPicList(); /* calc RefPicList */
+        memset(pic_param.ReferenceFrames, 0xff, 16 * sizeof(VAPictureH264)); /* invalid all */
+        if (current_frame_type == FRAME_P) {
+            pic_param.ReferenceFrames[0] = RefPicList0_P[0];
+        } else if (current_frame_type == FRAME_B) {
+            pic_param.ReferenceFrames[0] = RefPicList0_B[0];
+            pic_param.ReferenceFrames[1] = RefPicList1_B[0];
+        }
+    } else {
+        memcpy(pic_param.ReferenceFrames, ReferenceFrames, numShortTerm*sizeof(VAPictureH264));
+        for (i = numShortTerm; i < SURFACE_NUM; i++) {
+            pic_param.ReferenceFrames[i].picture_id = VA_INVALID_SURFACE;
+            pic_param.ReferenceFrames[i].flags = VA_PICTURE_H264_INVALID;
+        }
+    }
+    
+    pic_param.pic_fields.bits.idr_pic_flag = (current_frame_type == FRAME_IDR);
+    pic_param.pic_fields.bits.reference_pic_flag = (current_frame_type != FRAME_B);
+    pic_param.pic_fields.bits.entropy_coding_mode_flag = h264_entropy_mode;
+    pic_param.pic_fields.bits.deblocking_filter_control_present_flag = 1;
+    pic_param.frame_num = current_frame_num;
+    pic_param.coded_buf = gl_surfaces[current_frame_display % SURFACE_NUM].coded_buf;
+    pic_param.last_picture = false;  // FIXME
+    pic_param.pic_init_qp = initial_qp;
+
+    va_status = vaCreateBuffer(va_dpy, context_id, VAEncPictureParameterBufferType,
+                               sizeof(pic_param), 1, &pic_param, &pic_param_buf);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");;
+
+    va_status = vaRenderPicture(va_dpy, context_id, &pic_param_buf, 1);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");
+
+    return 0;
+}
+
+static int render_packedsequence(void)
+{
+    VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
+    VABufferID packedseq_para_bufid, packedseq_data_bufid, render_id[2];
+    unsigned int length_in_bits;
+    unsigned char *packedseq_buffer = NULL;
+    VAStatus va_status;
+
+    length_in_bits = build_packed_seq_buffer(&packedseq_buffer); 
+    
+    packedheader_param_buffer.type = VAEncPackedHeaderSequence;
+    
+    packedheader_param_buffer.bit_length = length_in_bits; /*length_in_bits*/
+    packedheader_param_buffer.has_emulation_bytes = 0;
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderParameterBufferType,
+                               sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer,
+                               &packedseq_para_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderDataBufferType,
+                               (length_in_bits + 7) / 8, 1, packedseq_buffer,
+                               &packedseq_data_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    render_id[0] = packedseq_para_bufid;
+    render_id[1] = packedseq_data_bufid;
+    va_status = vaRenderPicture(va_dpy, context_id, render_id, 2);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");
+
+    free(packedseq_buffer);
+    
+    return 0;
+}
+
+
+static int render_packedpicture(void)
+{
+    VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
+    VABufferID packedpic_para_bufid, packedpic_data_bufid, render_id[2];
+    unsigned int length_in_bits;
+    unsigned char *packedpic_buffer = NULL;
+    VAStatus va_status;
+
+    length_in_bits = build_packed_pic_buffer(&packedpic_buffer); 
+    packedheader_param_buffer.type = VAEncPackedHeaderPicture;
+    packedheader_param_buffer.bit_length = length_in_bits;
+    packedheader_param_buffer.has_emulation_bytes = 0;
+
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderParameterBufferType,
+                               sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer,
+                               &packedpic_para_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderDataBufferType,
+                               (length_in_bits + 7) / 8, 1, packedpic_buffer,
+                               &packedpic_data_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    render_id[0] = packedpic_para_bufid;
+    render_id[1] = packedpic_data_bufid;
+    va_status = vaRenderPicture(va_dpy, context_id, render_id, 2);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");
+
+    free(packedpic_buffer);
+    
+    return 0;
+}
+
+static void render_packedslice()
+{
+    VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
+    VABufferID packedslice_para_bufid, packedslice_data_bufid, render_id[2];
+    unsigned int length_in_bits;
+    unsigned char *packedslice_buffer = NULL;
+    VAStatus va_status;
+
+    length_in_bits = build_packed_slice_buffer(&packedslice_buffer);
+    packedheader_param_buffer.type = VAEncPackedHeaderSlice;
+    packedheader_param_buffer.bit_length = length_in_bits;
+    packedheader_param_buffer.has_emulation_bytes = 0;
+
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderParameterBufferType,
+                               sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer,
+                               &packedslice_para_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    va_status = vaCreateBuffer(va_dpy,
+                               context_id,
+                               VAEncPackedHeaderDataBufferType,
+                               (length_in_bits + 7) / 8, 1, packedslice_buffer,
+                               &packedslice_data_bufid);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");
+
+    render_id[0] = packedslice_para_bufid;
+    render_id[1] = packedslice_data_bufid;
+    va_status = vaRenderPicture(va_dpy, context_id, render_id, 2);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");
+
+    free(packedslice_buffer);
+}
+
+static int render_slice(void)
+{
+    VABufferID slice_param_buf;
+    VAStatus va_status;
+    int i;
+
+    update_RefPicList();
+    
+    /* one frame, one slice */
+    slice_param.macroblock_address = 0;
+    slice_param.num_macroblocks = frame_width_mbaligned * frame_height_mbaligned/(16*16); /* Measured by MB */
+    slice_param.slice_type = (current_frame_type == FRAME_IDR)?2:current_frame_type;
+    if (current_frame_type == FRAME_IDR) {
+        if (current_frame_encoding != 0)
+            ++slice_param.idr_pic_id;
+    } else if (current_frame_type == FRAME_P) {
+        int refpiclist0_max = h264_maxref & 0xffff;
+        memcpy(slice_param.RefPicList0, RefPicList0_P, refpiclist0_max*sizeof(VAPictureH264));
+
+        for (i = refpiclist0_max; i < 32; i++) {
+            slice_param.RefPicList0[i].picture_id = VA_INVALID_SURFACE;
+            slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
+        }
+    } else if (current_frame_type == FRAME_B) {
+        int refpiclist0_max = h264_maxref & 0xffff;
+        int refpiclist1_max = (h264_maxref >> 16) & 0xffff;
+
+        memcpy(slice_param.RefPicList0, RefPicList0_B, refpiclist0_max*sizeof(VAPictureH264));
+        for (i = refpiclist0_max; i < 32; i++) {
+            slice_param.RefPicList0[i].picture_id = VA_INVALID_SURFACE;
+            slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
+        }
+
+        memcpy(slice_param.RefPicList1, RefPicList1_B, refpiclist1_max*sizeof(VAPictureH264));
+        for (i = refpiclist1_max; i < 32; i++) {
+            slice_param.RefPicList1[i].picture_id = VA_INVALID_SURFACE;
+            slice_param.RefPicList1[i].flags = VA_PICTURE_H264_INVALID;
+        }
+    }
+
+    slice_param.slice_alpha_c0_offset_div2 = 0;
+    slice_param.slice_beta_offset_div2 = 0;
+    slice_param.direct_spatial_mv_pred_flag = 1;
+    slice_param.pic_order_cnt_lsb = (current_frame_display - current_IDR_display) % MaxPicOrderCntLsb;
+    
+
+    if (h264_packedheader &&
+        config_attrib[enc_packed_header_idx].value & VA_ENC_PACKED_HEADER_SLICE)
+        render_packedslice();
+
+    va_status = vaCreateBuffer(va_dpy, context_id, VAEncSliceParameterBufferType,
+                               sizeof(slice_param), 1, &slice_param, &slice_param_buf);
+    CHECK_VASTATUS(va_status, "vaCreateBuffer");;
+
+    va_status = vaRenderPicture(va_dpy, context_id, &slice_param_buf, 1);
+    CHECK_VASTATUS(va_status, "vaRenderPicture");
+    
+    return 0;
+}
+
+
+
+int H264Encoder::save_codeddata(unsigned long long display_order, unsigned long long encode_order, int frame_type)
+{    
+    VACodedBufferSegment *buf_list = NULL;
+    VAStatus va_status;
+    unsigned int coded_size = 0;
+
+    string data;
+
+    va_status = vaMapBuffer(va_dpy, gl_surfaces[display_order % SURFACE_NUM].coded_buf, (void **)(&buf_list));
+    CHECK_VASTATUS(va_status, "vaMapBuffer");
+    while (buf_list != NULL) {
+        data.append(reinterpret_cast<const char *>(buf_list->buf), buf_list->size);
+        if (coded_fp != nullptr)
+            coded_size += fwrite(buf_list->buf, 1, buf_list->size, coded_fp);
+        buf_list = (VACodedBufferSegment *) buf_list->next;
+
+        frame_size += coded_size;
+    }
+    vaUnmapBuffer(va_dpy, gl_surfaces[display_order % SURFACE_NUM].coded_buf);
+
+    AVPacket pkt;
+    memset(&pkt, 0, sizeof(pkt));
+    pkt.buf = nullptr;
+    pkt.pts = av_rescale_q(display_order, AVRational{1, frame_rate}, avstream->time_base);
+    pkt.dts = av_rescale_q(encode_order, AVRational{1, frame_rate}, avstream->time_base);
+    pkt.data = reinterpret_cast<uint8_t *>(&data[0]);
+    pkt.size = data.size();
+    pkt.stream_index = 0;
+    if (frame_type == FRAME_IDR || frame_type == FRAME_I) {
+        pkt.flags = AV_PKT_FLAG_KEY;
+    } else {
+        pkt.flags = 0;
+    }
+    pkt.duration = 1;
+    av_interleaved_write_frame(avctx, &pkt);
+
+#if 0
+    printf("\r      "); /* return back to startpoint */
+    switch (encode_order % 4) {
+        case 0:
+            printf("|");
+            break;
+        case 1:
+            printf("/");
+            break;
+        case 2:
+            printf("-");
+            break;
+        case 3:
+            printf("\\");
+            break;
+    }
+    printf("%08lld", encode_order);
+    printf("(%06d bytes coded)", coded_size);
+#endif
+
+    return 0;
+}
+
+
+// this is weird. but it seems to put a new frame onto the queue
+void H264Encoder::storage_task_enqueue(unsigned long long display_order, unsigned long long encode_order, int frame_type)
+{
+       std::unique_lock<std::mutex> lock(storage_task_queue_mutex);
+
+       storage_task tmp;
+       tmp.display_order = display_order;
+       tmp.encode_order = encode_order;
+       tmp.frame_type = frame_type;
+       storage_task_queue.push(tmp);
+       srcsurface_status[display_order % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING;
+
+       storage_task_queue_changed.notify_all();
+}
+
+void H264Encoder::storage_task_thread()
+{
+       for ( ;; ) {
+               storage_task current;
+               {
+                       // wait until there's an encoded frame  
+                       std::unique_lock<std::mutex> lock(storage_task_queue_mutex);
+                       storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || !storage_task_queue.empty(); });
+                       if (storage_thread_should_quit) return;
+                       current = storage_task_queue.front();
+                       storage_task_queue.pop();
+               }
+
+               VAStatus va_status;
+          
+               // waits for data, then saves it to disk.
+               va_status = vaSyncSurface(va_dpy, gl_surfaces[current.display_order % SURFACE_NUM].src_surface);
+               CHECK_VASTATUS(va_status, "vaSyncSurface");
+               save_codeddata(current.display_order, current.encode_order, current.frame_type);
+
+               {
+                       std::unique_lock<std::mutex> lock(storage_task_queue_mutex);
+                       srcsurface_status[current.display_order % SURFACE_NUM] = SRC_SURFACE_FREE;
+                       storage_task_queue_changed.notify_all();
+               }
+       }
+}
+
+static int release_encode()
+{
+    int i;
+    
+    for (i = 0; i < SURFACE_NUM; i++) {
+        vaDestroyBuffer(va_dpy, gl_surfaces[i].coded_buf);
+        vaDestroySurfaces(va_dpy, &gl_surfaces[i].src_surface, 1);
+        vaDestroySurfaces(va_dpy, &gl_surfaces[i].ref_surface, 1);
+    }
+    
+    vaDestroyContext(va_dpy, context_id);
+    vaDestroyConfig(va_dpy, config_id);
+
+    return 0;
+}
+
+static int deinit_va()
+{ 
+    vaTerminate(va_dpy);
+
+    va_close_display(va_dpy);
+
+    return 0;
+}
+
+
+static int print_input()
+{
+    printf("\n\nINPUT:Try to encode H264...\n");
+    if (rc_mode != -1)
+        printf("INPUT: RateControl  : %s\n", rc_to_string(rc_mode));
+    printf("INPUT: Resolution   : %dx%dframes\n", frame_width, frame_height);
+    printf("INPUT: FrameRate    : %d\n", frame_rate);
+    printf("INPUT: Bitrate      : %d\n", frame_bitrate);
+    printf("INPUT: Slieces      : %d\n", frame_slices);
+    printf("INPUT: IntraPeriod  : %d\n", intra_period);
+    printf("INPUT: IDRPeriod    : %d\n", intra_idr_period);
+    printf("INPUT: IpPeriod     : %d\n", ip_period);
+    printf("INPUT: Initial QP   : %d\n", initial_qp);
+    printf("INPUT: Min QP       : %d\n", minimal_qp);
+    printf("INPUT: Coded Clip   : %s\n", coded_fn);
+    
+    printf("\n\n"); /* return back to startpoint */
+    
+    return 0;
+}
+
+
+//H264Encoder::H264Encoder(SDL_Window *window, SDL_GLContext context, int width, int height, const char *output_filename) 
+H264Encoder::H264Encoder(QSurface *surface, int width, int height, const char *output_filename)
+       : current_storage_frame(0), surface(surface)
+       //: width(width), height(height), current_encoding_frame(0)
+{
+       av_register_all();
+       avctx = avformat_alloc_context();
+       avctx->oformat = av_guess_format(NULL, output_filename, NULL);
+       strcpy(avctx->filename, output_filename);
+       if (avio_open2(&avctx->pb, output_filename, AVIO_FLAG_WRITE, &avctx->interrupt_callback, NULL) < 0) {
+               fprintf(stderr, "%s: avio_open2() failed\n", output_filename);
+               exit(1);
+       }
+       AVCodec *codec = avcodec_find_encoder(AV_CODEC_ID_H264);
+       avstream = avformat_new_stream(avctx, codec);
+       if (avstream == nullptr) {
+               fprintf(stderr, "%s: avformat_new_stream() failed\n", output_filename);
+               exit(1);
+       }
+       avstream->time_base = AVRational{1, frame_rate};  // TODO
+       avstream->codec->width = width;
+       avstream->codec->height = height;
+       //avstream->codec->time_base = AVRational{1, 60};  // TODO
+       avstream->codec->time_base = AVRational{1, frame_rate};
+       avstream->codec->ticks_per_frame = 1;  // or 2?
+
+       if (avformat_write_header(avctx, NULL) < 0) {
+               fprintf(stderr, "%s: avformat_write_header() failed\n", output_filename);
+               exit(1);
+       }
+
+       coded_fp = fopen("dump.h264", "wb");
+       assert(coded_fp != NULL);
+
+       frame_width = width;
+       frame_height = height;
+       frame_width_mbaligned = (frame_width + 15) & (~15);
+       frame_height_mbaligned = (frame_height + 15) & (~15);
+        frame_bitrate = 15000000;  // / 60;
+       current_frame_encoding = 0;
+
+       print_input();
+
+       init_va();
+       setup_encode();
+
+       // No frames are ready yet.
+       memset(srcsurface_status, SRC_SURFACE_FREE, sizeof(srcsurface_status));
+           
+       memset(&seq_param, 0, sizeof(seq_param));
+       memset(&pic_param, 0, sizeof(pic_param));
+       memset(&slice_param, 0, sizeof(slice_param));
+
+       storage_thread = std::thread(&H264Encoder::storage_task_thread, this);
+
+       copy_thread = std::thread([this]{
+               //SDL_GL_MakeCurrent(window, context);
+               QOpenGLContext *context = create_context();
+               eglBindAPI(EGL_OPENGL_API);
+               if (!make_current(context, this->surface)) {
+                       printf("display=%p surface=%p context=%p curr=%p err=%d\n", eglGetCurrentDisplay(), this->surface, context, eglGetCurrentContext(),
+                               eglGetError());
+                       exit(1);
+               }
+               copy_thread_func();
+       });
+}
+
+H264Encoder::~H264Encoder()
+{
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               storage_thread_should_quit = true;
+               storage_task_queue_changed.notify_all();
+       }
+       {
+               unique_lock<mutex> lock(frame_queue_mutex);
+               copy_thread_should_quit = true;
+               frame_queue_nonempty.notify_one();
+       }
+       storage_thread.join();
+       copy_thread.join();
+
+       release_encode();
+       deinit_va();
+
+       av_write_trailer(avctx);
+       avformat_free_context(avctx);
+}
+
+bool H264Encoder::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+{
+       {
+               // Wait until this frame slot is done encoding.
+               std::unique_lock<std::mutex> lock(storage_task_queue_mutex);
+               storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || (srcsurface_status[current_storage_frame % SURFACE_NUM] == SRC_SURFACE_FREE); });
+               if (storage_thread_should_quit) return false;
+       }
+
+       //*fbo = fbos[current_storage_frame % SURFACE_NUM];
+       GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
+       *y_tex = surf->y_tex;
+       *cbcr_tex = surf->cbcr_tex;
+
+       VASurfaceID surface = surf->src_surface;
+        VAStatus va_status = vaDeriveImage(va_dpy, surface, &surf->surface_image);
+        CHECK_VASTATUS(va_status, "vaDeriveImage");
+
+       VABufferInfo buf_info;
+       buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;  // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM?
+       va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info);
+        CHECK_VASTATUS(va_status, "vaAcquireBufferHandle");
+
+       // Create Y image.
+       surf->y_egl_image = EGL_NO_IMAGE_KHR;
+       EGLint y_attribs[] = {
+               EGL_WIDTH, frame_width,
+               EGL_HEIGHT, frame_height,
+               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '),
+               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]),
+               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]),
+               EGL_NONE
+       };
+
+       surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs);
+       assert(surf->y_egl_image != EGL_NO_IMAGE_KHR);
+
+       // Associate Y image to a texture.
+       glBindTexture(GL_TEXTURE_2D, *y_tex);
+       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image);
+
+       // Create CbCr image.
+       surf->cbcr_egl_image = EGL_NO_IMAGE_KHR;
+       EGLint cbcr_attribs[] = {
+               EGL_WIDTH, frame_width,
+               EGL_HEIGHT, frame_height,
+               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'),
+               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]),
+               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]),
+               EGL_NONE
+       };
+
+       surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs);
+       assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR);
+
+       // Associate CbCr image to a texture.
+       glBindTexture(GL_TEXTURE_2D, *cbcr_tex);
+       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
+
+       return true;
+}
+
+void H264Encoder::end_frame(GLsync fence)
+{
+       {
+               unique_lock<mutex> lock(frame_queue_mutex);
+               pending_frames[current_storage_frame++] = fence;
+       }
+       frame_queue_nonempty.notify_one();
+}
+
+void H264Encoder::copy_thread_func()
+{
+       for ( ;; ) {
+               GLsync fence;
+               encoding2display_order(current_frame_encoding, intra_period, intra_idr_period, ip_period,
+                                      &current_frame_display, &current_frame_type);
+               if (current_frame_type == FRAME_IDR) {
+                       numShortTerm = 0;
+                       current_frame_num = 0;
+                       current_IDR_display = current_frame_display;
+               }
+
+               {
+                       unique_lock<mutex> lock(frame_queue_mutex);
+                       frame_queue_nonempty.wait(lock, [this]{ return copy_thread_should_quit || pending_frames.count(current_frame_display) != 0; });
+                       if (copy_thread_should_quit) return;
+                       fence = pending_frames[current_frame_display];
+                       pending_frames.erase(current_frame_display);
+               }
+
+               // Wait for the GPU to be done with the frame.
+               glClientWaitSync(fence, 0, 0);
+               glDeleteSync(fence);
+
+               // Unmap the image.
+               GLSurface *surf = &gl_surfaces[current_frame_display % SURFACE_NUM];
+               eglDestroyImageKHR(eglGetCurrentDisplay(), surf->y_egl_image);
+               eglDestroyImageKHR(eglGetCurrentDisplay(), surf->cbcr_egl_image);
+               VAStatus va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
+               CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
+               va_status = vaDestroyImage(va_dpy, surf->surface_image.image_id);
+               CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+               VASurfaceID surface = surf->src_surface;
+
+               // Schedule the frame for encoding.
+               va_status = vaBeginPicture(va_dpy, context_id, surface);
+               CHECK_VASTATUS(va_status, "vaBeginPicture");
+
+               if (current_frame_type == FRAME_IDR) {
+                       render_sequence();
+                       render_picture();            
+                       if (h264_packedheader) {
+                               render_packedsequence();
+                               render_packedpicture();
+                       }
+               } else {
+                       //render_sequence();
+                       render_picture();
+               }
+               render_slice();
+               
+               va_status = vaEndPicture(va_dpy, context_id);
+               CHECK_VASTATUS(va_status, "vaEndPicture");
+
+               // so now the data is done encoding (well, async job kicked off)...
+               // we send that to the storage thread
+               storage_task_enqueue(current_frame_display, current_frame_encoding, current_frame_type);
+               
+               update_ReferenceFrames();
+               ++current_frame_encoding;
+       }
+}
diff --git a/h264encode.h b/h264encode.h
new file mode 100644 (file)
index 0000000..f355116
--- /dev/null
@@ -0,0 +1,105 @@
+// Hardware H.264 encoding via VAAPI. Heavily modified based on example
+// code by Intel. Intel's original copyright and license is reproduced below:
+//
+// Copyright (c) 2007-2013 Intel Corporation. All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sub license, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+// 
+// The above copyright notice and this permission notice (including the
+// next paragraph) shall be included in all copies or substantial portions
+// of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+// IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+// ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H264ENCODE_H
+#define _H264ENCODE_H
+
+extern "C" {
+#include <libavformat/avformat.h>
+}
+#include <epoxy/egl.h>
+#include <atomic>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <thread>
+#include <thread>
+#include <thread>
+#include <condition_variable>
+
+#include "pbo_frame_allocator.h"
+#include "context.h"
+
+#define SURFACE_NUM 16 /* 16 surfaces for source YUV */
+
+class H264Encoder {
+public:
+       H264Encoder(QSurface *surface, int width, int height, const char *output_filename);
+       ~H264Encoder();
+       //void add_frame(FrameAllocator::Frame frame, GLsync fence);
+
+#if 0
+       struct Frame {
+       public:
+               GLuint fbo;
+               GLuint y_tex, cbcr_tex; 
+
+       private:
+               //int surface_subnum;
+       };
+       void 
+#endif
+       bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
+       void end_frame(GLsync fence);
+
+private:
+       struct storage_task {
+               unsigned long long display_order;
+               unsigned long long encode_order;
+               int frame_type;
+       };
+
+       void copy_thread_func();
+       void storage_task_thread();
+       void storage_task_enqueue(unsigned long long display_order, unsigned long long encode_order, int frame_type);
+       int save_codeddata(unsigned long long display_order, unsigned long long encode_order, int frame_type);
+
+       std::thread copy_thread, storage_thread;
+
+       std::mutex storage_task_queue_mutex;
+       std::condition_variable storage_task_queue_changed;
+       int srcsurface_status[SURFACE_NUM];  // protected by storage_task_queue_mutex
+       std::queue<storage_task> storage_task_queue;  // protected by storage_task_queue_mutex
+       bool storage_thread_should_quit = false;  // protected by storage_task_queue_mutex
+
+       std::mutex frame_queue_mutex;
+       std::condition_variable frame_queue_nonempty;
+       bool copy_thread_should_quit = false;  // under frame_queue_mutex
+
+       //int frame_width, frame_height;
+       //int ;
+       int current_storage_frame;
+#if 0
+       std::map<int, std::pair<FrameAllocator::Frame, GLsync>> pending_frames;
+#endif
+       std::map<int, GLsync> pending_frames;
+       QSurface *surface;
+
+       AVFormatContext *avctx;
+       AVStream *avstream;
+};
+
+#endif
diff --git a/main.cpp b/main.cpp
new file mode 100644 (file)
index 0000000..34a1c49
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,28 @@
+#include <QApplication>
+#include <QDesktopWidget>
+#include <QSurfaceFormat>
+#include <QtGui/QOpenGLContext>
+
+#include "mainwindow.h"
+
+int main(int argc, char *argv[])
+{
+       setenv("QT_XCB_GL_INTEGRATION", "xcb_egl", 0);
+
+       QCoreApplication::setAttribute(Qt::AA_ShareOpenGLContexts, true);
+       QApplication app(argc, argv);
+
+       QSurfaceFormat fmt;
+       fmt.setDepthBufferSize(0);
+       fmt.setStencilBufferSize(0);
+       fmt.setProfile(QSurfaceFormat::CoreProfile);
+       fmt.setMajorVersion(3);
+       fmt.setMinorVersion(1);
+       QSurfaceFormat::setDefaultFormat(fmt);
+
+       MainWindow mainWindow;
+       mainWindow.resize(mainWindow.sizeHint());
+       mainWindow.show();
+
+       return app.exec();
+}
diff --git a/mainwindow.cpp b/mainwindow.cpp
new file mode 100644 (file)
index 0000000..4804c08
--- /dev/null
@@ -0,0 +1,13 @@
+#include "mainwindow.h"
+#include "window.h"
+#include <thread>
+
+#include "context.h"
+#include "mixer.h"
+
+using std::thread;
+
+MainWindow::MainWindow()
+{
+        setCentralWidget(new Window(this));
+}
diff --git a/mainwindow.h b/mainwindow.h
new file mode 100644 (file)
index 0000000..fd60640
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef MAINWINDOW_H
+#define MAINWINDOW_H
+
+#include <QMainWindow>
+
+class MainWindow : public QMainWindow
+{
+    Q_OBJECT
+
+public:
+    MainWindow();
+};
+
+#endif
diff --git a/mixer.cpp b/mixer.cpp
new file mode 100644 (file)
index 0000000..b8cdbe6
--- /dev/null
+++ b/mixer.cpp
@@ -0,0 +1,654 @@
+#define GL_GLEXT_PROTOTYPES 1
+#define NO_SDL_GLEXT 1
+#define NUM_CARDS 2
+
+#define WIDTH 1280
+#define HEIGHT 720
+
+#include <epoxy/gl.h>
+#include <epoxy/egl.h>
+
+#undef Success
+
+#include <assert.h>
+#include <features.h>
+#include <math.h>
+#include <png.h>
+#include <pngconf.h>
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+#include <mutex>
+#include <queue>
+#include <condition_variable>
+
+#include <diffusion_effect.h>
+#include <effect.h>
+#include <effect_chain.h>
+#include <flat_input.h>
+#include <image_format.h>
+#include <init.h>
+#include <lift_gamma_gain_effect.h>
+#include <saturation_effect.h>
+#include <util.h>
+#include <ycbcr_input.h>
+#include <vignette_effect.h>
+#include <resample_effect.h>
+#include <resize_effect.h>
+#include <overlay_effect.h>
+#include <padding_effect.h>
+#include <white_balance_effect.h>
+#include <ycbcr.h>
+#include <resource_pool.h>
+#include <effect_util.h>
+
+#include <EGL/egl.h>
+
+#include "h264encode.h"
+#include "context.h"
+#include "bmusb.h"
+#include "pbo_frame_allocator.h"
+
+using namespace movit;
+using namespace std;
+using namespace std::placeholders;
+
+// shared between all EGL contexts
+EGLDisplay egl_display;
+EGLSurface egl_surface;
+EGLConfig ecfg;
+EGLint ctxattr[] = {
+       EGL_CONTEXT_CLIENT_VERSION, 2,
+       EGL_CONTEXT_MAJOR_VERSION_KHR, 3,
+       EGL_CONTEXT_MINOR_VERSION_KHR, 1,
+       //EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR,
+       EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR, EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT,
+       EGL_NONE
+};
+
+EGLConfig pbuffer_ecfg;
+
+std::mutex bmusb_mutex;  // protects <cards>
+
+struct CaptureCard {
+       BMUSBCapture *usb;
+
+       // Threading stuff
+       bool thread_initialized;
+       QSurface *surface;
+       QOpenGLContext *context;
+
+       bool new_data_ready;  // Whether new_frame contains anything.
+       PBOFrameAllocator::Frame new_frame;
+       GLsync new_data_ready_fence;  // Whether new_frame is ready for rendering.
+       std::condition_variable new_data_ready_changed;  // Set whenever new_data_ready is changed.
+};
+CaptureCard cards[NUM_CARDS];
+
+void bm_frame(int card_index, uint16_t timecode,
+              FrameAllocator::Frame video_frame, size_t video_offset, uint16_t video_format,
+             FrameAllocator::Frame audio_frame, size_t audio_offset, uint16_t audio_format)
+{
+       CaptureCard *card = &cards[card_index];
+       if (!card->thread_initialized) {
+               printf("initializing context for bmusb thread %d\n", card_index);
+               eglBindAPI(EGL_OPENGL_API);
+               card->context = create_context();
+               if (!make_current(card->context, card->surface)) {
+                       printf("failed to create bmusb context\n");
+                       exit(1);
+               }
+               card->thread_initialized = true;
+       }       
+
+       if (video_frame.len - video_offset != 1280 * 750 * 2) {
+               printf("dropping frame with wrong length (%ld)\n", video_frame.len - video_offset);
+               FILE *fp = fopen("frame.raw", "wb");
+               fwrite(video_frame.data, video_frame.len, 1, fp);
+               fclose(fp);
+               //exit(1);
+               card->usb->get_video_frame_allocator()->release_frame(video_frame);
+               card->usb->get_audio_frame_allocator()->release_frame(audio_frame);
+               return;
+       }
+       {
+               // Wait until the previous frame was consumed.
+               std::unique_lock<std::mutex> lock(bmusb_mutex);
+               card->new_data_ready_changed.wait(lock, [card]{ return !card->new_data_ready; });
+       }
+       GLuint pbo = (GLint)(intptr_t)video_frame.userdata;
+       check_error();
+       glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+       check_error();
+       glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, video_frame.size);
+       check_error();
+       //glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
+       //check_error();
+       GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);              
+       check_error();
+       assert(fence != nullptr);
+       {
+               std::unique_lock<std::mutex> lock(bmusb_mutex);
+               card->new_data_ready = true;
+               card->new_frame = video_frame;
+               card->new_data_ready_fence = fence;
+               card->new_data_ready_changed.notify_all();
+       }
+
+       // Video frame will be released later.
+        card->usb->get_audio_frame_allocator()->release_frame(audio_frame);
+}
+       
+void place_rectangle(Effect *resample_effect, Effect *padding_effect, float x0, float y0, float x1, float y1)
+{
+       float srcx0 = 0.0f;
+       float srcx1 = 1.0f;
+       float srcy0 = 0.0f;
+       float srcy1 = 1.0f;
+
+       // Cull.
+       if (x0 > 1280.0 || x1 < 0.0 || y0 > 720.0 || y1 < 0.0) {
+               CHECK(resample_effect->set_int("width", 1));
+               CHECK(resample_effect->set_int("height", 1));
+               CHECK(resample_effect->set_float("zoom_x", 1280.0));
+               CHECK(resample_effect->set_float("zoom_y", 720.0));
+               CHECK(padding_effect->set_int("left", 2000));
+               CHECK(padding_effect->set_int("top", 2000));
+               return; 
+       }
+
+       // Clip. (TODO: Clip on upper/left sides, too.)
+       if (x1 > 1280.0) {
+               srcx1 = (1280.0 - x0) / (x1 - x0);
+               x1 = 1280.0;
+       }
+       if (y1 > 720.0) {
+               srcy1 = (720.0 - y0) / (y1 - y0);
+               y1 = 720.0;
+       }
+
+       float x_subpixel_offset = x0 - floor(x0);
+       float y_subpixel_offset = y0 - floor(y0);
+
+       // Resampling must be to an integral number of pixels. Round up,
+       // and then add an extra pixel so we have some leeway for the border.
+       int width = int(ceil(x1 - x0)) + 1;
+       int height = int(ceil(y1 - y0)) + 1;
+       CHECK(resample_effect->set_int("width", width));
+       CHECK(resample_effect->set_int("height", height));
+
+       // Correct the discrepancy with zoom. (This will leave a small
+       // excess edge of pixels and subpixels, which we'll correct for soon.)
+       float zoom_x = (x1 - x0) / (width * (srcx1 - srcx0));
+       float zoom_y = (y1 - y0) / (height * (srcy1 - srcy0));
+       CHECK(resample_effect->set_float("zoom_x", zoom_x));
+       CHECK(resample_effect->set_float("zoom_y", zoom_y));
+       CHECK(resample_effect->set_float("zoom_center_x", 0.0f));
+       CHECK(resample_effect->set_float("zoom_center_y", 0.0f));
+
+       // Padding must also be to a whole-pixel offset.
+       CHECK(padding_effect->set_int("left", floor(x0)));
+       CHECK(padding_effect->set_int("top", floor(y0)));
+
+       // Correct _that_ discrepancy by subpixel offset in the resampling.
+       CHECK(resample_effect->set_float("left", -x_subpixel_offset / zoom_x));
+       CHECK(resample_effect->set_float("top", -y_subpixel_offset / zoom_y));
+
+       // Finally, adjust the border so it is exactly where we want it.
+       CHECK(padding_effect->set_float("border_offset_left", x_subpixel_offset));
+       CHECK(padding_effect->set_float("border_offset_right", x1 - (floor(x0) + width)));
+       CHECK(padding_effect->set_float("border_offset_top", y_subpixel_offset));
+       CHECK(padding_effect->set_float("border_offset_bottom", y1 - (floor(y0) + height)));
+}
+
+void mixer_thread(QSurface *surface, QSurface *surface2, QSurface *surface3, QSurface *surface4)
+{
+       bool quit = false;
+
+       cards[0].surface = surface3;
+#if NUM_CARDS == 2
+       cards[1].surface = surface4;
+#endif
+
+       eglBindAPI(EGL_OPENGL_API);
+       //QSurface *surface = create_surface();
+       QOpenGLContext *context = create_context();
+       if (!make_current(context, surface)) {
+               printf("oops\n");
+               exit(1);
+       }
+       printf("egl=%p\n", eglGetCurrentContext());
+
+       CHECK(init_movit("/usr/share/movit", MOVIT_DEBUG_ON));
+       printf("GPU texture subpixel precision: about %.1f bits\n",
+               log2(1.0f / movit_texel_subpixel_precision));
+       printf("Wrongly rounded x+0.48 or x+0.52 values: %d/510\n",
+               movit_num_wrongly_rounded);
+       if (movit_num_wrongly_rounded > 0) {
+               if (movit_shader_rounding_supported) {
+                       printf("Rounding off in the shader to compensate.\n");
+               } else {
+                       printf("No shader roundoff available; cannot compensate.\n");
+               }
+       }
+
+       //printf("egl_api=%d EGL_OPENGL_API=%d\n", epoxy_egl_get_current_gl_context_api(), EGL_OPENGL_API);
+       //exit(0);
+
+       check_error();
+
+       EffectChain chain(WIDTH, HEIGHT);
+       check_error();
+#if 0
+       glViewport(0, 0, WIDTH, HEIGHT);
+       check_error();
+
+       glMatrixMode(GL_PROJECTION);
+       check_error();
+       glLoadIdentity();
+       check_error();
+       glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+       check_error();
+
+       glMatrixMode(GL_MODELVIEW);
+       glLoadIdentity();
+#endif
+
+       ImageFormat inout_format;
+       inout_format.color_space = COLORSPACE_sRGB;
+       inout_format.gamma_curve = GAMMA_sRGB;
+
+       YCbCrFormat ycbcr_format;
+       ycbcr_format.chroma_subsampling_x = 2;
+       ycbcr_format.chroma_subsampling_y = 1;
+       ycbcr_format.cb_x_position = 0.0;
+       ycbcr_format.cr_x_position = 0.0;
+       ycbcr_format.cb_y_position = 0.5;
+       ycbcr_format.cr_y_position = 0.5;
+       ycbcr_format.luma_coefficients = YCBCR_REC_601;
+       ycbcr_format.full_range = false;
+
+       YCbCrInput *input[NUM_CARDS];
+
+       input[0] = new YCbCrInput(inout_format, ycbcr_format, WIDTH, HEIGHT, YCBCR_INPUT_SPLIT_Y_AND_CBCR);
+       chain.add_input(input[0]);
+       //if (NUM_CARDS == 2) {
+               input[1] = new YCbCrInput(inout_format, ycbcr_format, WIDTH, HEIGHT, YCBCR_INPUT_SPLIT_Y_AND_CBCR);
+               chain.add_input(input[1]);
+       //}
+       //YCbCr422InterleavedInput *input = new YCbCr422InterleavedInput(inout_format, ycbcr_format, WIDTH, HEIGHT);
+       //YCbCr422InterleavedInput *input = new YCbCr422InterleavedInput(inout_format, ycbcr_format, 2, 1);
+       Effect *resample_effect = chain.add_effect(new ResampleEffect(), input[0]);
+       Effect *padding_effect = chain.add_effect(new IntegralPaddingEffect());
+       float border_color[] = { 0.0f, 0.0f, 0.0f, 1.0f };
+       CHECK(padding_effect->set_vec4("border_color", border_color));
+
+       //Effect *resample2_effect = chain.add_effect(new ResampleEffect(), input[1 % NUM_CARDS]);
+       Effect *resample2_effect = chain.add_effect(new ResampleEffect(), input[1]);
+       Effect *saturation_effect = chain.add_effect(new SaturationEffect());
+       CHECK(saturation_effect->set_float("saturation", 0.3f));
+       Effect *wb_effect = chain.add_effect(new WhiteBalanceEffect());
+       CHECK(wb_effect->set_float("output_color_temperature", 3500.0));
+       Effect *padding2_effect = chain.add_effect(new IntegralPaddingEffect());
+
+       chain.add_effect(new OverlayEffect(), padding_effect, padding2_effect);
+
+       ycbcr_format.chroma_subsampling_x = 1;
+
+       chain.add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR);
+       chain.set_dither_bits(8);
+       chain.set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
+       chain.finalize();
+
+#if 0
+       // generate a PBO to hold the data we read back with glReadPixels()
+       // (Intel/DRI goes into a slow path if we don't read to PBO)
+       GLuint pbo;
+       glGenBuffers(1, &pbo);
+       glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo);
+       glBufferData(GL_PIXEL_PACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_STREAM_READ);
+#endif
+
+       //make_hsv_wheel_texture();
+
+//     QSurface *create_surface(const QSurfaceFormat &format);
+       H264Encoder h264_encoder(surface2, WIDTH, HEIGHT, "test.mp4");
+
+       printf("Configuring first card...\n");
+       cards[0].usb = new BMUSBCapture(0x1edb, 0xbd3b);  // 0xbd4f
+       //cards[0].usb = new BMUSBCapture(0x1edb, 0xbd4f);
+       cards[0].usb->set_frame_callback(std::bind(bm_frame, 0, _1, _2, _3, _4, _5, _6, _7));
+       std::unique_ptr<PBOFrameAllocator> pbo_allocator1(new PBOFrameAllocator(1280 * 750 * 2 + 44));
+       cards[0].usb->set_video_frame_allocator(pbo_allocator1.get());
+       cards[0].usb->configure_card();
+
+       std::unique_ptr<PBOFrameAllocator> pbo_allocator2(new PBOFrameAllocator(1280 * 750 * 2 + 44));
+       if (NUM_CARDS == 2) {
+               printf("Configuring second card...\n");
+               cards[1].usb = new BMUSBCapture(0x1edb, 0xbd4f);
+               cards[1].usb->set_frame_callback(std::bind(bm_frame, 1, _1, _2, _3, _4, _5, _6, _7));
+               cards[1].usb->set_video_frame_allocator(pbo_allocator2.get());
+               cards[1].usb->configure_card();
+       }
+
+       BMUSBCapture::start_bm_thread();
+
+       for (int card_index = 0; card_index < NUM_CARDS; ++card_index) {
+               cards[card_index].usb->start_bm_capture();
+       }
+
+       int frame = 0;
+#if _POSIX_C_SOURCE >= 199309L
+       struct timespec start, now;
+       clock_gettime(CLOCK_MONOTONIC, &start);
+#else
+       struct timeval start, now;
+       gettimeofday(&start, NULL);
+#endif
+
+       PBOFrameAllocator::Frame bmusb_current_rendering_frame[NUM_CARDS];
+       for (int card_index = 0; card_index < NUM_CARDS; ++card_index) {
+               bmusb_current_rendering_frame[card_index] =
+                       cards[card_index].usb->get_video_frame_allocator()->alloc_frame();
+               GLint input_tex_pbo = (GLint)(intptr_t)bmusb_current_rendering_frame[card_index].userdata;
+               input[card_index]->set_pixel_data(0, nullptr, input_tex_pbo);
+               input[card_index]->set_pixel_data(1, nullptr, input_tex_pbo);
+               check_error();
+       }
+
+       //chain.enable_phase_timing(true);
+
+       // Set up stuff for NV12 conversion.
+#if 0
+       PBOFrameAllocator nv12_frame_pool(WIDTH * HEIGHT * 3 / 2, /*num_frames=*/24,
+               GL_PIXEL_PACK_BUFFER_ARB, GL_MAP_READ_BIT | GL_MAP_COHERENT_BIT, 0);
+#endif
+       ResourcePool *resource_pool = chain.get_resource_pool();
+       //GLuint ycbcr_tex = resource_pool->create_2d_texture(GL_RGBA8, WIDTH, HEIGHT);
+       GLuint chroma_tex = resource_pool->create_2d_texture(GL_RG8, WIDTH, HEIGHT);
+
+#if 0
+       // Y shader.
+       string y_vert_shader = read_version_dependent_file("vs-y", "vert");
+       string y_frag_shader =
+               "#version 130 \n"
+               "in vec2 tc; \n"
+               "uniform sampler2D ycbcr_tex; \n"
+               "void main() { \n"
+               "    gl_FragColor = texture2D(ycbcr_tex, tc); \n"
+               "} \n";
+       GLuint y_program_num = resource_pool->compile_glsl_program(y_vert_shader, y_frag_shader);
+#endif
+
+#if 1
+       // Cb/Cr shader.
+       string cbcr_vert_shader = read_version_dependent_file("vs-cbcr", "vert");
+       string cbcr_frag_shader =
+               "#version 130 \n"
+               "in vec2 tc0; \n"
+//             "in vec2 tc1; \n"
+               "uniform sampler2D cbcr_tex; \n"
+               "void main() { \n"
+               "    gl_FragColor = texture2D(cbcr_tex, tc0); \n"
+//             "    gl_FragColor.ba = texture2D(cbcr_tex, tc1).gb; \n"
+               "} \n";
+       GLuint cbcr_program_num = resource_pool->compile_glsl_program(cbcr_vert_shader, cbcr_frag_shader);
+#endif
+
+       GLuint vao;
+       glGenVertexArrays(1, &vao);
+       check_error();
+
+       while (!quit) {
+               ++frame;
+
+               //int width0 = lrintf(848 * (1.0 + 0.2 * sin(frame * 0.02)));
+               int width0 = 848;
+               int height0 = lrintf(width0 * 9.0 / 16.0);
+
+               //float top0 = 96 + 48 * sin(frame * 0.005);
+               //float left0 = 96 + 48 * cos(frame * 0.006);
+               float top0 = 48;
+               float left0 = 16;
+               float bottom0 = top0 + height0;
+               float right0 = left0 + width0;
+
+               int width1 = 384;
+               int height1 = 216;
+       
+               float bottom1 = 720 - 48;
+               float right1 = 1280 - 16;
+               float top1 = bottom1 - height1;
+               float left1 = right1 - width1;
+               
+               float t = 0.5 + 0.5 * cos(frame * 0.006);
+               //float t = 0.0;
+               float scale0 = 1.0 + t * (1280.0 / 848.0 - 1.0);
+               float tx0 = 0.0 + t * (-16.0 * scale0);
+               float ty0 = 0.0 + t * (-48.0 * scale0);
+
+               top0 = top0 * scale0 + ty0;
+               bottom0 = bottom0 * scale0 + ty0;
+               left0 = left0 * scale0 + tx0;
+               right0 = right0 * scale0 + tx0;
+
+               top1 = top1 * scale0 + ty0;
+               bottom1 = bottom1 * scale0 + ty0;
+               left1 = left1 * scale0 + tx0;
+               right1 = right1 * scale0 + tx0;
+
+               place_rectangle(resample_effect, padding_effect, left0, top0, right0, bottom0);
+               place_rectangle(resample2_effect, padding2_effect, left1, top1, right1, bottom1);
+
+               CaptureCard card_copy[NUM_CARDS];
+
+               {
+                       std::unique_lock<std::mutex> lock(bmusb_mutex);
+
+                       // The first card is the master timer, so wait for it to have a new frame.
+                       // TODO: Make configurable, and with a timeout.
+                       cards[0].new_data_ready_changed.wait(lock, []{ return cards[0].new_data_ready; });
+
+                       for (int card_index = 0; card_index < NUM_CARDS; ++card_index) {
+                               CaptureCard *card = &cards[card_index];
+                               card_copy[card_index].usb = card->usb;
+                               card_copy[card_index].new_data_ready = card->new_data_ready;
+                               card_copy[card_index].new_frame = card->new_frame;
+                               card_copy[card_index].new_data_ready_fence = card->new_data_ready_fence;
+                               card->new_data_ready = false;
+                               card->new_data_ready_changed.notify_all();
+                       }
+               }
+       
+               for (int card_index = 0; card_index < NUM_CARDS; ++card_index) {
+                       CaptureCard *card = &card_copy[card_index];
+                       if (!card->new_data_ready)
+                               continue;
+
+                       // FIXME: We could still be rendering from it! 
+                       card->usb->get_video_frame_allocator()->release_frame(bmusb_current_rendering_frame[card_index]);
+                       bmusb_current_rendering_frame[card_index] = card->new_frame;
+
+                       // The new texture might still be uploaded,
+                       // tell the GPU to wait until it's there.
+                       if (card->new_data_ready_fence)
+                               glWaitSync(card->new_data_ready_fence, /*flags=*/0, GL_TIMEOUT_IGNORED);
+                       check_error();
+                       glDeleteSync(card->new_data_ready_fence);
+                       check_error();
+                       GLint input_tex_pbo = (GLint)(intptr_t)bmusb_current_rendering_frame[card_index].userdata;
+                       input[card_index]->set_pixel_data(0, (unsigned char *)BUFFER_OFFSET((1280 * 750 * 2 + 44) / 2 + 1280 * 25 + 22), input_tex_pbo);
+                       input[card_index]->set_pixel_data(1, (unsigned char *)BUFFER_OFFSET(1280 * 25 + 22), input_tex_pbo);
+
+                       if (NUM_CARDS == 1) {
+                               // Set to the other one, too.
+                               input[1]->set_pixel_data(0, (unsigned char *)BUFFER_OFFSET((1280 * 750 * 2 + 44) / 2 + 1280 * 25 + 22), input_tex_pbo);
+                               input[1]->set_pixel_data(1, (unsigned char *)BUFFER_OFFSET(1280 * 25 + 22), input_tex_pbo);
+                       }
+               }
+
+               GLuint y_tex, cbcr_tex;
+               bool got_frame = h264_encoder.begin_frame(&y_tex, &cbcr_tex);
+               assert(got_frame);
+
+               // Render chain.
+               {
+                       GLuint ycbcr_fbo = resource_pool->create_fbo(y_tex, chroma_tex);
+                       chain.render_to_fbo(ycbcr_fbo, WIDTH, HEIGHT);
+                       resource_pool->release_fbo(ycbcr_fbo);
+               }
+               if (false) {
+                       glViewport(0, 0, WIDTH, HEIGHT);
+                       chain.render_to_screen();
+               }
+
+#if 0
+               PBOFrameAllocator::Frame nv12_frame = nv12_frame_pool.alloc_frame();
+               assert(nv12_frame.data != nullptr);  // should never happen... maybe?
+               GLuint pbo = (GLuint)(intptr_t)nv12_frame.userdata;
+               glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo);
+               check_error();
+#endif
+
+               // Set up for extraction.
+               float vertices[] = {
+                       0.0f, 2.0f,
+                       0.0f, 0.0f,
+                       2.0f, 0.0f
+               };
+
+               glBindVertexArray(vao);
+               check_error();
+
+#if 0
+               // Extract Y.
+               GLuint y_fbo = resource_pool->create_fbo(y_tex);
+               glBindFramebuffer(GL_FRAMEBUFFER, y_fbo);
+               glViewport(0, 0, WIDTH, HEIGHT);
+               check_error();
+               {
+                       glUseProgram(y_program_num);
+                       check_error();
+                       glActiveTexture(GL_TEXTURE0);
+                       check_error();
+                       glBindTexture(GL_TEXTURE_2D, ycbcr_tex);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+                       check_error();
+
+                       GLuint position_vbo = fill_vertex_attribute(y_program_num, "position", 2, GL_FLOAT, sizeof(vertices), vertices);
+                       GLuint texcoord_vbo = fill_vertex_attribute(y_program_num, "texcoord", 2, GL_FLOAT, sizeof(vertices), vertices);  // Same as vertices.
+
+                       glDrawArrays(GL_TRIANGLES, 0, 3);
+                       check_error();
+
+                       cleanup_vertex_attribute(y_program_num, "position", position_vbo);
+                       check_error();
+                       cleanup_vertex_attribute(y_program_num, "texcoord", texcoord_vbo);
+                       check_error();
+
+                       resource_pool->release_fbo(y_fbo);
+               }
+#endif
+
+               // Extract Cb/Cr.
+               GLuint cbcr_fbo = resource_pool->create_fbo(cbcr_tex);
+               glBindFramebuffer(GL_FRAMEBUFFER, cbcr_fbo);
+               glViewport(0, 0, WIDTH/2, HEIGHT/2);
+               check_error();
+               GLsync fence;
+               {
+                       glUseProgram(cbcr_program_num);
+                       check_error();
+
+                       glActiveTexture(GL_TEXTURE0);
+                       check_error();
+                       glBindTexture(GL_TEXTURE_2D, chroma_tex);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+                       check_error();
+                       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+                       check_error();
+
+                       float chroma_offset_0[] = { -0.5f / WIDTH, 0.0f };
+//                     float chroma_offset_1[] = { +0.5f / WIDTH, 0.0f };
+                       set_uniform_vec2(cbcr_program_num, "foo", "chroma_offset_0", chroma_offset_0);
+//                     set_uniform_vec2(cbcr_program_num, "foo", "chroma_offset_1", chroma_offset_1);
+
+                       GLuint position_vbo = fill_vertex_attribute(cbcr_program_num, "position", 2, GL_FLOAT, sizeof(vertices), vertices);
+                       GLuint texcoord_vbo = fill_vertex_attribute(cbcr_program_num, "texcoord", 2, GL_FLOAT, sizeof(vertices), vertices);  // Same as vertices.
+
+                       glDrawArrays(GL_TRIANGLES, 0, 3);
+                       check_error();
+
+                       cleanup_vertex_attribute(cbcr_program_num, "position", position_vbo);
+                       cleanup_vertex_attribute(cbcr_program_num, "texcoord", texcoord_vbo);
+
+                       glUseProgram(0);
+                       check_error();
+
+#if 0  
+                       glReadPixels(0, 0, WIDTH/4, HEIGHT/2, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(WIDTH * HEIGHT));
+                       check_error();
+#endif
+
+                       fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);              
+                       check_error();
+
+                       resource_pool->release_fbo(cbcr_fbo);
+               }
+
+#if 0
+               glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
+#endif
+
+#if 1
+               h264_encoder.end_frame(fence);
+#else
+               nv12_frame_pool.release_frame(nv12_frame);
+#endif
+
+               //eglSwapBuffers(egl_display, egl_surface);
+
+
+#if 1
+#if _POSIX_C_SOURCE >= 199309L
+               clock_gettime(CLOCK_MONOTONIC, &now);
+               double elapsed = now.tv_sec - start.tv_sec +
+                       1e-9 * (now.tv_nsec - start.tv_nsec);
+#else
+               gettimeofday(&now, NULL);
+               double elapsed = now.tv_sec - start.tv_sec +
+                       1e-6 * (now.tv_usec - start.tv_usec);
+#endif
+               if (frame % 100 == 0) {
+                       printf("%d frames in %.3f seconds = %.1f fps (%.1f ms/frame)\n",
+                               frame, elapsed, frame / elapsed,
+                               1e3 * elapsed / frame);
+               //      chain.print_phase_timing();
+               }
+
+               // Reset every 100 frames, so that local variations in frame times
+               // (especially for the first few frames, when the shaders are
+               // compiled etc.) don't make it hard to measure for the entire
+               // remaining duration of the program.
+               if (frame == 10000) {
+                       frame = 0;
+                       start = now;
+               }
+#endif
+       }
+       glDeleteVertexArrays(1, &vao);
+       //resource_pool->release_glsl_program(y_program_num);
+       resource_pool->release_glsl_program(cbcr_program_num);
+       resource_pool->release_2d_texture(chroma_tex);
+       BMUSBCapture::stop_bm_thread();
+}
diff --git a/mixer.h b/mixer.h
new file mode 100644 (file)
index 0000000..ef7701f
--- /dev/null
+++ b/mixer.h
@@ -0,0 +1,2 @@
+class QSurface;
+void mixer_thread(QSurface *surface, QSurface *surface2, QSurface *surface3, QSurface *surface4);
diff --git a/pbo_frame_allocator.cpp b/pbo_frame_allocator.cpp
new file mode 100644 (file)
index 0000000..59013bf
--- /dev/null
@@ -0,0 +1,70 @@
+#include "pbo_frame_allocator.h"
+#include "util.h"
+
+using namespace std;
+
+PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
+        : frame_size(frame_size), buffer(buffer)
+{
+       for (size_t i = 0; i < num_queued_frames; ++i) {
+               GLuint pbo;
+               glGenBuffers(1, &pbo);
+               check_error();
+               glBindBuffer(buffer, pbo);
+               check_error();
+               glBufferStorage(buffer, frame_size, NULL, permissions | GL_MAP_PERSISTENT_BIT);
+               check_error();
+
+               Frame frame;
+               frame.data = (uint8_t *)glMapBufferRange(buffer, 0, frame_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT);
+               frame.data2 = frame.data + frame_size / 2;
+               check_error();
+               frame.size = frame_size;
+               frame.userdata = (void *)(intptr_t)pbo;
+               frame.owner = this;
+               frame.interleaved = true;
+               freelist.push(frame);
+       }
+       glBindBuffer(buffer, 0);
+       check_error();
+}
+
+PBOFrameAllocator::~PBOFrameAllocator()
+{
+       while (!freelist.empty()) {
+               Frame frame = freelist.front();
+               freelist.pop();
+               GLuint pbo = (intptr_t)frame.userdata;
+               glBindBuffer(buffer, pbo);
+               check_error();
+               glUnmapBuffer(buffer);
+               check_error();
+               glBindBuffer(buffer, 0);
+               check_error();
+               glDeleteBuffers(1, &pbo);
+       }
+}
+//static int sumsum = 0;
+
+FrameAllocator::Frame PBOFrameAllocator::alloc_frame()
+{
+        Frame vf;
+
+       std::unique_lock<std::mutex> lock(freelist_mutex);  // Meh.
+       if (freelist.empty()) {
+               printf("Frame overrun (no more spare PBO frames), dropping frame!\n");
+       } else {
+               //fprintf(stderr, "freelist has %d allocated\n", ++sumsum);
+               vf = freelist.front();
+               freelist.pop();  // Meh.
+       }
+       vf.len = 0;
+       return vf;
+}
+
+void PBOFrameAllocator::release_frame(Frame frame)
+{
+       std::unique_lock<std::mutex> lock(freelist_mutex);
+       freelist.push(frame);
+       //--sumsum;
+}
diff --git a/pbo_frame_allocator.h b/pbo_frame_allocator.h
new file mode 100644 (file)
index 0000000..1155d20
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef _PBO_FRAME_ALLOCATOR 
+#define _PBO_FRAME_ALLOCATOR 1
+
+#include <mutex>
+#include <queue>
+#include <epoxy/gl.h>
+
+#include "bmusb.h"
+
+// An allocator that allocates straight into OpenGL pinned memory.
+// Meant for video frames only. We use a queue rather than a stack,
+// since we want to maximize pipelineability.
+class PBOFrameAllocator : public FrameAllocator {
+public:
+       // Note: You need to have an OpenGL context when calling
+       // the constructor.
+       PBOFrameAllocator(size_t frame_size,
+                         size_t num_queued_frames = 16,  // FIXME: should be 6
+                         GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB,
+                         GLenum permissions = GL_MAP_WRITE_BIT,
+                         GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT);
+       ~PBOFrameAllocator() override;
+        Frame alloc_frame() override;
+        void release_frame(Frame frame) override;
+
+private:
+       size_t frame_size;
+
+       std::mutex freelist_mutex;
+       std::queue<Frame> freelist;
+       GLenum buffer;
+};
+
+#endif  // !defined(_PBO_FRAME_ALLOCATOR)
diff --git a/vs-cbcr.130.vert b/vs-cbcr.130.vert
new file mode 100644 (file)
index 0000000..3c639ce
--- /dev/null
@@ -0,0 +1,23 @@
+#version 130
+
+in vec2 position;
+in vec2 texcoord;
+out vec2 tc0;
+//out vec2 tc1;
+uniform vec2 foo_chroma_offset_0;
+//uniform vec2 foo_chroma_offset_1;
+
+void main()
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       vec2 flipped_tc = texcoord;
+//     flipped_tc.y = 1.0 - flipped_tc.y;
+       tc0 = flipped_tc + foo_chroma_offset_0;
+//     tc1 = flipped_tc + foo_chroma_offset_1;
+}
diff --git a/window.cpp b/window.cpp
new file mode 100644 (file)
index 0000000..b8eced7
--- /dev/null
@@ -0,0 +1,23 @@
+#include "glwidget.h"
+#include "window.h"
+#include "mainwindow.h"
+#include <QApplication>
+#include <QBoxLayout>
+
+Window::Window(MainWindow *mw)
+       : main_window(mw)
+{
+       gl_widget = new GLWidget;
+
+       QVBoxLayout *mainLayout = new QVBoxLayout;
+       QHBoxLayout *container = new QHBoxLayout;
+       container->addWidget(gl_widget);
+
+       QWidget *w = new QWidget;
+       w->setLayout(container);
+       mainLayout->addWidget(w);
+
+       setLayout(mainLayout);
+
+       setWindowTitle(tr("Nageru"));
+}
diff --git a/window.h b/window.h
new file mode 100644 (file)
index 0000000..ff883f4
--- /dev/null
+++ b/window.h
@@ -0,0 +1,21 @@
+#ifndef WINDOW_H
+#define WINDOW_H
+
+#include <QWidget>
+
+class GLWidget;
+class MainWindow;
+
+class Window : public QWidget
+{
+       Q_OBJECT
+
+public:
+       Window(MainWindow *mw);
+
+private:
+       GLWidget *gl_widget;
+       MainWindow *main_window;
+};
+
+#endif