4:4:4 encoding support

author Fiona Glaser <fiona@x264.com>

Wed, 22 Jun 2011 10:32:53 +0000 (03:32 -0700)

committer Fiona Glaser <fiona@x264.com>

Sun, 10 Jul 2011 04:15:52 +0000 (21:15 -0700)
author Fiona Glaser <fiona@x264.com>
Wed, 22 Jun 2011 10:32:53 +0000 (03:32 -0700)
committer Fiona Glaser <fiona@x264.com>
Sun, 10 Jul 2011 04:15:52 +0000 (21:15 -0700)
diff --git a/common/cabac.c b/common/cabac.c

index 75eee3f042302cff9048b93920ac57c4f2316272..eb6ab06a8bb93b33c0e124a9c59096af4724995d 100644 (file)
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -28,7 +28,7 @@
  #include "common.h"
  
  
-static const int8_t x264_cabac_context_init_I[460][2] =
+static const int8_t x264_cabac_context_init_I[1024][2] =
  {
      /* 0 - 10 */
      { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
@@ -194,10 +194,153 @@ static const int8_t x264_cabac_context_init_I[460][2] =
      { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
      {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
      {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
-    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
+
+    /* 460 - 1023 */
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 },
+    { -12,  63 }, {  -2,  68 }, { -15,  84 }, { -13, 104 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
+    {  -7,  93 }, { -11,  87 }, {  -3,  77 }, {  -5,  71 },
+    {  -4,  63 }, {  -4,  68 }, { -12,  84 }, {  -7,  62 },
+    {  -7,  65 }, {   8,  61 }, {   5,  56 }, {  -2,  66 },
+    {   1,  64 }, {   0,  61 }, {  -2,  78 }, {   1,  50 },
+    {   7,  52 }, {  10,  35 }, {   0,  44 }, {  11,  38 },
+    {   1,  45 }, {   0,  46 }, {   5,  44 }, {  31,  17 },
+    {   1,  51 }, {   7,  50 }, {  28,  19 }, {  16,  33 },
+    {  14,  62 }, { -13, 108 }, { -15, 100 }, { -13, 101 },
+    { -13,  91 }, { -12,  94 }, { -10,  88 }, { -16,  84 },
+    { -10,  86 }, {  -7,  83 }, { -13,  87 }, { -19,  94 },
+    {   1,  70 }, {   0,  72 }, {  -5,  74 }, {  18,  59 },
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
+    {  24,   0 }, {  15,   9 }, {   8,  25 }, {  13,  18 },
+    {  15,   9 }, {  13,  19 }, {  10,  37 }, {  12,  18 },
+    {   6,  29 }, {  20,  33 }, {  15,  30 }, {   4,  45 },
+    {   1,  58 }, {   0,  62 }, {   7,  61 }, {  12,  38 },
+    {  11,  45 }, {  15,  39 }, {  11,  42 }, {  13,  44 },
+    {  16,  45 }, {  12,  41 }, {  10,  49 }, {  30,  34 },
+    {  18,  42 }, {  10,  55 }, {  17,  51 }, {  17,  46 },
+    {   0,  89 }, {  26, -19 }, {  22, -17 }, {  26, -17 },
+    {  30, -25 }, {  28, -20 }, {  33, -23 }, {  37, -27 },
+    {  33, -23 }, {  40, -28 }, {  38, -17 }, {  33, -11 },
+    {  40, -15 }, {  41,  -6 }, {  38,   1 }, {  41,  17 },
+    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
+    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
+    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
+    { -23,  68 }, { -24,  50 }, { -11,  74 }, { -14, 106 },
+    { -13,  97 }, { -15,  90 }, { -12,  90 }, { -18,  88 },
+    { -10,  73 }, {  -9,  79 }, { -14,  86 }, { -10,  73 },
+    { -10,  70 }, { -10,  69 }, {  -5,  66 }, {  -9,  64 },
+    {  -5,  58 }, {   2,  59 }, {  23, -13 }, {  26, -13 },
+    {  40, -15 }, {  49, -14 }, {  44,   3 }, {  45,   6 },
+    {  44,  34 }, {  33,  54 }, {  19,  82 }, {  21, -10 },
+    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
+    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 },
+    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
+    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
+    {   0,  68 }, {  -9,  92 }, { -17, 120 }, { -20, 112 },
+    { -18, 114 }, { -11,  85 }, { -15,  92 }, { -14,  89 },
+    { -26,  71 }, { -15,  81 }, { -14,  80 }, {   0,  68 },
+    { -14,  70 }, { -24,  56 }, { -23,  68 }, { -24,  50 },
+    { -11,  74 }, { -14, 106 }, { -13,  97 }, { -15,  90 },
+    { -12,  90 }, { -18,  88 }, { -10,  73 }, {  -9,  79 },
+    { -14,  86 }, { -10,  73 }, { -10,  70 }, { -10,  69 },
+    {  -5,  66 }, {  -9,  64 }, {  -5,  58 }, {   2,  59 },
+    {  23, -13 }, {  26, -13 }, {  40, -15 }, {  49, -14 },
+    {  44,   3 }, {  45,   6 }, {  44,  34 }, {  33,  54 },
+    {  19,  82 }, {  21, -10 }, {  24, -11 }, {  28,  -8 },
+    {  28,  -1 }, {  29,   3 }, {  29,   9 }, {  35,  20 },
+    {  29,  36 }, {  14,  67 }, {  -3,  75 }, {  -1,  23 },
+    {   1,  34 }, {   1,  43 }, {   0,  54 }, {  -2,  55 },
+    {   0,  61 }, {   1,  64 }, {   0,  68 }, {  -9,  92 },
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
+    {  -6,  93 }, {  -6,  84 }, {  -8,  79 }, {   0,  66 },
+    {  -1,  71 }, {   0,  62 }, {  -2,  60 }, {  -2,  59 },
+    {  -5,  75 }, {  -3,  62 }, {  -4,  58 }, {  -9,  66 },
+    {  -1,  79 }, {   0,  71 }, {   3,  68 }, {  10,  44 },
+    {  -7,  62 }, {  15,  36 }, {  14,  40 }, {  16,  27 },
+    {  12,  29 }, {   1,  44 }, {  20,  36 }, {  18,  32 },
+    {   5,  42 }, {   1,  48 }, {  10,  62 }, {  17,  46 },
+    {   9,  64 }, { -12, 104 }, { -11,  97 }, { -16,  96 },
+    {  -7,  88 }, {  -8,  85 }, {  -7,  85 }, {  -9,  85 },
+    { -13,  88 }, {   4,  66 }, {  -3,  77 }, {  -3,  76 },
+    {  -6,  76 }, {  10,  58 }, {  -1,  76 }, {  -1,  83 },
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
+    {  15,   6 }, {   6,  19 }, {   7,  16 }, {  12,  14 },
+    {  18,  13 }, {  13,  11 }, {  13,  15 }, {  15,  16 },
+    {  12,  23 }, {  13,  23 }, {  15,  20 }, {  14,  26 },
+    {  14,  44 }, {  17,  40 }, {  17,  47 }, {  24,  17 },
+    {  21,  21 }, {  25,  22 }, {  31,  27 }, {  22,  29 },
+    {  19,  35 }, {  14,  50 }, {  10,  57 }, {   7,  63 },
+    {  -2,  77 }, {  -4,  82 }, {  -3,  94 }, {   9,  69 },
+    { -12, 109 }, {  36, -35 }, {  36, -34 }, {  32, -26 },
+    {  37, -30 }, {  44, -32 }, {  34, -18 }, {  34, -15 },
+    {  40, -15 }, {  33,  -7 }, {  35,  -5 }, {  33,   0 },
+    {  38,   2 }, {  33,  13 }, {  23,  35 }, {  13,  58 },
+    {  -3,  71 }, {  -6,  42 }, {  -5,  50 }, {  -3,  54 },
+    {  -2,  62 }, {   0,  58 }, {   1,  63 }, {  -2,  72 },
+    {  -1,  74 }, {  -9,  91 }, {  -5,  67 }, {  -5,  27 },
+    {  -3,  39 }, {  -2,  44 }, {   0,  46 }, { -16,  64 },
+    {  -8,  68 }, { -10,  78 }, {  -6,  77 }, { -10,  86 },
+    { -12,  92 }, { -15,  55 }, { -10,  60 }, {  -6,  62 },
+    {  -4,  65 }, { -12,  73 }, {  -8,  76 }, {  -7,  80 },
+    {  -9,  88 }, { -17, 110 }, {  -3,  71 }, {  -6,  42 },
+    {  -5,  50 }, {  -3,  54 }, {  -2,  62 }, {   0,  58 },
+    {   1,  63 }, {  -2,  72 }, {  -1,  74 }, {  -9,  91 },
+    {  -5,  67 }, {  -5,  27 }, {  -3,  39 }, {  -2,  44 },
+    {   0,  46 }, { -16,  64 }, {  -8,  68 }, { -10,  78 },
+    {  -6,  77 }, { -10,  86 }, { -12,  92 }, { -15,  55 },
+    { -10,  60 }, {  -6,  62 }, {  -4,  65 }, { -12,  73 },
+    {  -8,  76 }, {  -7,  80 }, {  -9,  88 }, { -17, 110 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 },
+    {  -3,  70 }, {  -8,  93 }, { -10,  90 }, { -30, 127 }
  };
  
-static const int8_t x264_cabac_context_init_PB[3][460][2] =
+static const int8_t x264_cabac_context_init_PB[3][1024][2] =
  {
      /* i_cabac_init_idc == 0 */
      {
@@ -353,6 +496,149 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =
          { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
          {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
          {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+
+        /* 460 - 1023 */
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -7,  92 }, {  -5,  89 }, {  -7,  96 }, { -13, 108 },
+        {  -3,  46 }, {  -1,  65 }, {  -1,  57 }, {  -9,  93 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
+        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
+        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
+        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
+        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
+        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
+        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
+        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
+        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
+        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
+        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
+        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
+        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
+        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
+        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
+        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
+        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
+        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
+        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
+        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
+        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
+        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
+        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
+        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
+        { -16,  66 }, { -22,  65 }, { -20,  63 }, {  -5,  85 },
+        {  -6,  81 }, { -10,  77 }, {  -7,  81 }, { -17,  80 },
+        { -18,  73 }, {  -4,  74 }, { -10,  83 }, {  -9,  71 },
+        {  -9,  67 }, {  -1,  61 }, {  -8,  66 }, { -14,  66 },
+        {   0,  59 }, {   2,  59 }, {   9,  -2 }, {  26,  -9 },
+        {  33,  -9 }, {  39,  -7 }, {  41,  -2 }, {  45,   3 },
+        {  49,   9 }, {  45,  27 }, {  36,  59 }, {  21, -13 },
+        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
+        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
+        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
+        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
+        {  -8,  66 }, {  -8,  76 }, {  -4,  79 }, {  -7,  71 },
+        {  -5,  69 }, {  -9,  70 }, {  -8,  66 }, { -10,  68 },
+        { -19,  73 }, { -12,  69 }, { -16,  70 }, { -15,  67 },
+        { -20,  62 }, { -19,  70 }, { -16,  66 }, { -22,  65 },
+        { -20,  63 }, {  -5,  85 }, {  -6,  81 }, { -10,  77 },
+        {  -7,  81 }, { -17,  80 }, { -18,  73 }, {  -4,  74 },
+        { -10,  83 }, {  -9,  71 }, {  -9,  67 }, {  -1,  61 },
+        {  -8,  66 }, { -14,  66 }, {   0,  59 }, {   2,  59 },
+        {   9,  -2 }, {  26,  -9 }, {  33,  -9 }, {  39,  -7 },
+        {  41,  -2 }, {  45,   3 }, {  49,   9 }, {  45,  27 },
+        {  36,  59 }, {  21, -13 }, {  33, -14 }, {  39,  -7 },
+        {  46,  -2 }, {  51,   2 }, {  60,   6 }, {  61,  17 },
+        {  55,  34 }, {  42,  62 }, {  -6,  66 }, {  -7,  35 },
+        {  -7,  42 }, {  -8,  45 }, {  -5,  48 }, { -12,  56 },
+        {  -6,  60 }, {  -5,  62 }, {  -8,  66 }, {  -8,  76 },
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
+        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
+        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
+        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
+        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
+        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
+        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
+        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
+        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
+        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
+        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
+        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
+        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
+        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
+        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
+        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
+        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
+        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
+        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
+        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
+        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
+        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
+        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
+        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
+        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
+        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
+        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
+        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
+        {  -3,  74 }, { -10,  90 }, {  -6,  76 }, {  -2,  44 },
+        {   0,  45 }, {   0,  52 }, {  -3,  64 }, {  -2,  59 },
+        {  -4,  70 }, {  -4,  75 }, {  -8,  82 }, { -17, 102 },
+        {  -9,  77 }, {   3,  24 }, {   0,  42 }, {   0,  48 },
+        {   0,  55 }, {  -6,  59 }, {  -7,  71 }, { -12,  83 },
+        { -11,  87 }, { -30, 119 }, {   1,  58 }, {  -3,  29 },
+        {  -1,  36 }, {   1,  38 }, {   2,  43 }, {  -6,  55 },
+        {   0,  58 }, {   0,  64 }, {  -3,  74 }, { -10,  90 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 },
+        {  -3,  74 }, {  -9,  92 }, {  -8,  87 }, { -23, 126 }
      },
  
      /* i_cabac_init_idc == 1 */
@@ -509,6 +795,149 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =
          {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
          {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
          {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+
+        /* 460 - 1023 */
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {   0,  80 }, {  -5,  89 }, {  -7,  94 }, {  -4,  92 },
+        {   0,  39 }, {   0,  65 }, { -15,  84 }, { -35, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
+        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
+        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
+        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
+        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
+        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
+        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
+        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
+        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
+        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
+        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
+        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
+        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
+        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
+        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
+        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
+        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
+        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
+        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
+        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
+        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
+        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
+        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
+        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
+        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  -3,  81 },
+        {  -3,  76 }, {  -7,  72 }, {  -6,  78 }, { -12,  72 },
+        { -14,  68 }, {  -3,  70 }, {  -6,  76 }, {  -5,  66 },
+        {  -5,  62 }, {   0,  57 }, {  -4,  61 }, {  -9,  60 },
+        {   1,  54 }, {   2,  58 }, {  17, -10 }, {  32, -13 },
+        {  42,  -9 }, {  49,  -5 }, {  53,   0 }, {  64,   3 },
+        {  68,  10 }, {  66,  27 }, {  47,  57 }, {  17, -10 },
+        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
+        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
+        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
+        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
+        {  -4,  67 }, {  -7,  82 }, {  -5,  85 }, {  -6,  81 },
+        { -10,  77 }, {  -7,  81 }, { -17,  80 }, { -18,  73 },
+        {  -4,  74 }, { -10,  83 }, {  -9,  71 }, {  -9,  67 },
+        {  -1,  61 }, {  -8,  66 }, { -14,  66 }, {   0,  59 },
+        {   2,  59 }, {  -3,  81 }, {  -3,  76 }, {  -7,  72 },
+        {  -6,  78 }, { -12,  72 }, { -14,  68 }, {  -3,  70 },
+        {  -6,  76 }, {  -5,  66 }, {  -5,  62 }, {   0,  57 },
+        {  -4,  61 }, {  -9,  60 }, {   1,  54 }, {   2,  58 },
+        {  17, -10 }, {  32, -13 }, {  42,  -9 }, {  49,  -5 },
+        {  53,   0 }, {  64,   3 }, {  68,  10 }, {  66,  27 },
+        {  47,  57 }, {  17, -10 }, {  32, -13 }, {  42,  -9 },
+        {  49,  -5 }, {  53,   0 }, {  64,   3 }, {  68,  10 },
+        {  66,  27 }, {  47,  57 }, {  -5,  71 }, {   0,  24 },
+        {  -1,  36 }, {  -2,  42 }, {  -2,  52 }, {  -9,  57 },
+        {  -6,  63 }, {  -4,  65 }, {  -4,  67 }, {  -7,  82 },
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
+        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
+        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
+        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
+        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
+        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
+        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
+        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
+        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
+        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
+        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
+        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
+        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
+        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
+        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
+        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
+        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
+        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
+        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
+        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
+        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
+        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
+        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
+        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
+        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
+        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
+        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
+        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
+        {  -5,  74 }, {  -9,  86 }, { -23, 112 }, { -15,  71 },
+        {  -7,  61 }, {   0,  53 }, {  -5,  66 }, { -11,  77 },
+        {  -9,  80 }, {  -9,  84 }, { -10,  87 }, { -34, 127 },
+        { -21, 101 }, {  -3,  39 }, {  -5,  53 }, {  -7,  61 },
+        { -11,  75 }, { -15,  77 }, { -17,  91 }, { -25, 107 },
+        { -25, 111 }, { -28, 122 }, { -11,  76 }, { -10,  44 },
+        { -10,  52 }, { -10,  57 }, {  -9,  58 }, { -16,  72 },
+        {  -7,  69 }, {  -4,  69 }, {  -5,  74 }, {  -9,  86 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 },
+        {  -2,  73 }, { -12, 104 }, {  -9,  91 }, { -31, 127 }
      },
  
      /* i_cabac_init_idc == 2 */
@@ -665,6 +1094,149 @@ static const int8_t x264_cabac_context_init_PB[3][460][2] =
          { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
          {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
          {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+
+        /* 460 - 1024 */
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  11,  80 }, {   5,  76 }, {   2,  84 }, {   5,  78 },
+        {  -6,  55 }, {   4,  61 }, { -14,  83 }, { -37, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
+        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
+        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
+        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
+        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
+        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
+        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
+        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
+        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
+        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
+        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
+        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
+        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
+        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
+        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
+        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
+        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
+        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
+        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
+        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
+        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
+        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
+        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
+        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
+        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {  -3,  78 },
+        {  -8,  74 }, {  -9,  72 }, { -10,  72 }, { -18,  75 },
+        { -12,  71 }, { -11,  63 }, {  -5,  70 }, { -17,  75 },
+        { -14,  72 }, { -16,  67 }, {  -8,  53 }, { -14,  59 },
+        {  -9,  52 }, { -11,  68 }, {   9,  -2 }, {  30, -10 },
+        {  31,  -4 }, {  33,  -1 }, {  33,   7 }, {  31,  12 },
+        {  37,  23 }, {  31,  38 }, {  20,  64 }, {   9,  -2 },
+        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
+        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
+        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
+        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
+        {  -6,  68 }, { -10,  79 }, {  -3,  78 }, {  -8,  74 },
+        {  -9,  72 }, { -10,  72 }, { -18,  75 }, { -12,  71 },
+        { -11,  63 }, {  -5,  70 }, { -17,  75 }, { -14,  72 },
+        { -16,  67 }, {  -8,  53 }, { -14,  59 }, {  -9,  52 },
+        { -11,  68 }, {  -3,  78 }, {  -8,  74 }, {  -9,  72 },
+        { -10,  72 }, { -18,  75 }, { -12,  71 }, { -11,  63 },
+        {  -5,  70 }, { -17,  75 }, { -14,  72 }, { -16,  67 },
+        {  -8,  53 }, { -14,  59 }, {  -9,  52 }, { -11,  68 },
+        {   9,  -2 }, {  30, -10 }, {  31,  -4 }, {  33,  -1 },
+        {  33,   7 }, {  31,  12 }, {  37,  23 }, {  31,  38 },
+        {  20,  64 }, {   9,  -2 }, {  30, -10 }, {  31,  -4 },
+        {  33,  -1 }, {  33,   7 }, {  31,  12 }, {  37,  23 },
+        {  31,  38 }, {  20,  64 }, {  -9,  71 }, {  -7,  37 },
+        {  -8,  44 }, { -11,  49 }, { -10,  56 }, { -12,  59 },
+        {  -8,  63 }, {  -9,  67 }, {  -6,  68 }, { -10,  79 },
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
+        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
+        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
+        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
+        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
+        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
+        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
+        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
+        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
+        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
+        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
+        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
+        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
+        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
+        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
+        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
+        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
+        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
+        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
+        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
+        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
+        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
+        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
+        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
+        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
+        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
+        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
+        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
+        { -12,  92 }, { -18, 108 }, { -24, 115 }, { -22,  82 },
+        {  -9,  62 }, {   0,  53 }, {   0,  59 }, { -14,  85 },
+        { -13,  89 }, { -13,  94 }, { -11,  92 }, { -29, 127 },
+        { -21, 100 }, { -14,  57 }, { -12,  67 }, { -11,  71 },
+        { -10,  77 }, { -21,  85 }, { -16,  88 }, { -23, 104 },
+        { -15,  98 }, { -37, 127 }, { -10,  82 }, {  -8,  48 },
+        {  -8,  61 }, {  -8,  66 }, {  -7,  70 }, { -14,  75 },
+        { -10,  79 }, {  -9,  83 }, { -12,  92 }, { -18, 108 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 },
+        {  -5,  79 }, { -11, 104 }, { -11,  91 }, { -30, 127 }
      }
  };
  
@@ -753,16 +1325,17 @@ const uint16_t x264_cabac_entropy[128] =
      FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000)
  };
  
-uint8_t x264_cabac_contexts[4][QP_MAX_SPEC+1][460];
+uint8_t x264_cabac_contexts[4][QP_MAX_SPEC+1][1024];
  
-void x264_cabac_init( void )
+void x264_cabac_init( x264_t *h )
  {
+    int ctx_count = CHROMA444 ? 1024 : 460;
      for( int i = 0; i < 4; i++ )
      {
-        const int8_t (*cabac_context_init)[460][2] = i == 0 ? &x264_cabac_context_init_I
-                                                            : &x264_cabac_context_init_PB[i-1];
+        const int8_t (*cabac_context_init)[1024][2] = i == 0 ? &x264_cabac_context_init_I
+                                                             : &x264_cabac_context_init_PB[i-1];
          for( int qp = 0; qp <= QP_MAX_SPEC; qp++ )
-            for( int j = 0; j < 460; j++ )
+            for( int j = 0; j < ctx_count; j++ )
              {
                  int state = x264_clip3( (((*cabac_context_init)[j][0] * qp) >> 4) + (*cabac_context_init)[j][1], 1, 126 );
                  x264_cabac_contexts[i][qp][j] = (X264_MIN( state, 127-state ) << 1) | (state >> 6);
@@ -773,9 +1346,9 @@ void x264_cabac_init( void )
  /*****************************************************************************
   *
   *****************************************************************************/
-void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
+void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model )
  {
-    memcpy( cb->state, x264_cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], 460 );
+    memcpy( cb->state, x264_cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], CHROMA444 ? 1024 : 460 );
  }
  
  void x264_cabac_encode_init_core( x264_cabac_t *cb )
diff --git a/common/cabac.h b/common/cabac.h
index ffd0dd5e2ef277a24b72b1ba1d04179f79e7b0e1..9e67395181439d8d7e05f17cf5724e08dae2f36d 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -45,14 +45,17 @@ typedef struct
      ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
  
      /* context */
-    uint8_t state[460];
+    uint8_t state[1024];
+
+    /* for 16-byte alignment */
+    uint8_t padding[12];
  } x264_cabac_t;
  
  extern const uint8_t x264_cabac_transition[128][2];
  extern const uint16_t x264_cabac_entropy[128];
  
  /* init the contexts given i_slice_type, the quantif and the model */
-void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
+void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model );
  
  void x264_cabac_encode_init_core( x264_cabac_t *cb );
  void x264_cabac_encode_init ( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end );
diff --git a/common/common.c b/common/common.c
index ff23cf4fe5c2ee9e3d648a6409547ee2c975eecb..99cb3d2f7fc9ad2558f672592d3c445b51211f9b 100644
--- a/common/common.c
+++ b/common/common.c
@@ -151,11 +151,13 @@ void x264_param_default( x264_param_t *param )
  
      param->i_cqm_preset = X264_CQM_FLAT;
      memset( param->cqm_4iy, 16, sizeof( param->cqm_4iy ) );
-    memset( param->cqm_4ic, 16, sizeof( param->cqm_4ic ) );
      memset( param->cqm_4py, 16, sizeof( param->cqm_4py ) );
+    memset( param->cqm_4ic, 16, sizeof( param->cqm_4ic ) );
      memset( param->cqm_4pc, 16, sizeof( param->cqm_4pc ) );
      memset( param->cqm_8iy, 16, sizeof( param->cqm_8iy ) );
      memset( param->cqm_8py, 16, sizeof( param->cqm_8py ) );
+    memset( param->cqm_8ic, 16, sizeof( param->cqm_8ic ) );
+    memset( param->cqm_8pc, 16, sizeof( param->cqm_8pc ) );
  
      param->b_repeat_headers = 1;
      param->b_annexb = 1;
@@ -763,8 +765,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
      {
          p->i_cqm_preset = X264_CQM_CUSTOM;
          b_error |= parse_cqm( value, p->cqm_4iy, 16 );
-        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
          b_error |= parse_cqm( value, p->cqm_4py, 16 );
+        b_error |= parse_cqm( value, p->cqm_4ic, 16 );
          b_error |= parse_cqm( value, p->cqm_4pc, 16 );
      }
      OPT("cqm8")
@@ -772,6 +774,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          p->i_cqm_preset = X264_CQM_CUSTOM;
          b_error |= parse_cqm( value, p->cqm_8iy, 64 );
          b_error |= parse_cqm( value, p->cqm_8py, 64 );
+        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
+        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
      }
      OPT("cqm4i")
      {
@@ -809,11 +813,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
      {
          p->i_cqm_preset = X264_CQM_CUSTOM;
          b_error |= parse_cqm( value, p->cqm_8iy, 64 );
+        b_error |= parse_cqm( value, p->cqm_8ic, 64 );
      }
      OPT("cqm8p")
      {
          p->i_cqm_preset = X264_CQM_CUSTOM;
          b_error |= parse_cqm( value, p->cqm_8py, 64 );
+        b_error |= parse_cqm( value, p->cqm_8pc, 64 );
      }
      OPT("log")
          p->i_log_level = atoi(value);
diff --git a/common/common.h b/common/common.h
index cfee673517f280ed54269dfea574f85b2f48d554..a4e1cf96a413849bd0cc91c5de0e1ce50d727574 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,7 +56,7 @@ do {\
  #define X264_BFRAME_MAX 16
  #define X264_REF_MAX 16
  #define X264_THREAD_MAX 128
-#define X264_PCM_COST (384*BIT_DEPTH+16)
+#define X264_PCM_COST ((384<<CHROMA444)*BIT_DEPTH+16)
  #define X264_LOOKAHEAD_MAX 250
  #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
  #define QP_MAX_SPEC (51+QP_BD_OFFSET)
@@ -102,6 +102,8 @@ do {\
  #   define PARAM_INTERLACED 0
  #endif
  
+#define CHROMA444 (h->sps->i_chroma_format_idc == 3)
+
  /* Unions for type-punning.
   * Mn: load or store n bits, aligned, native-endian
   * CPn: copy n bits, aligned, native-endian
@@ -143,41 +145,49 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
  
  #define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
  
-#define X264_SCAN8_SIZE (6*8)
  #define X264_SCAN8_LUMA_SIZE (5*8)
+#define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3)
  #define X264_SCAN8_0 (4+1*8)
  
-static const unsigned x264_scan8[16+2*4+3] =
+/* Scan8 organization:
+ *    0 1 2 3 4 5 6 7
+ * 0  DY    y y y y y
+ * 1        y Y Y Y Y
+ * 2        y Y Y Y Y
+ * 3        y Y Y Y Y
+ * 4        y Y Y Y Y
+ * 5  DU    u u u u u
+ * 6        u U U U U
+ * 7        u U U U U
+ * 8        u U U U U
+ * 9        u U U U U
+ * 10 DV    v v v v v
+ * 11       v V V V V
+ * 12       v V V V V
+ * 13       v V V V V
+ * 14       v V V V V
+ * DY/DU/DV are for luma/chroma DC.
+ */
+
+#define LUMA_DC   48
+#define CHROMA_DC 49
+
+static const uint8_t x264_scan8[16*3 + 3] =
  {
-    /* Luma */
-    4+1*8, 5+1*8, 4+2*8, 5+2*8,
-    6+1*8, 7+1*8, 6+2*8, 7+2*8,
-    4+3*8, 5+3*8, 4+4*8, 5+4*8,
-    6+3*8, 7+3*8, 6+4*8, 7+4*8,
-
-    /* Cb */
-    1+1*8, 2+1*8,
-    1+2*8, 2+2*8,
-
-    /* Cr */
-    1+4*8, 2+4*8,
-    1+5*8, 2+5*8,
-
-    /* Luma DC */
-    4+5*8,
-
-    /* Chroma DC */
-    6+5*8, 7+5*8
+    4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8,
+    6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8,
+    4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8,
+    6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8,
+    4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8,
+    6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8,
+    4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8,
+    6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8,
+    4+11*8, 5+11*8, 4+12*8, 5+12*8,
+    6+11*8, 7+11*8, 6+12*8, 7+12*8,
+    4+13*8, 5+13*8, 4+14*8, 5+14*8,
+    6+13*8, 7+13*8, 6+14*8, 7+14*8,
+    0+ 0*8, 0+ 5*8, 0+10*8
  };
-/*
-   0 1 2 3 4 5 6 7
- 0
- 1   B B   L L L L
- 2   B B   L L L L
- 3         L L L L
- 4   R R   L L L L
- 5   R R   Dy  DuDv
-*/
  
  #include "x264.h"
  #include "bitstream.h"
@@ -216,7 +226,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  void x264_reduce_fraction( uint32_t *n, uint32_t *d );
  void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
  void x264_cavlc_init( void );
-void x264_cabac_init( void );
+void x264_cabac_init( x264_t *h );
  
  static ALWAYS_INLINE pixel x264_clip_pixel( int x )
  {
@@ -468,16 +478,16 @@ struct x264_t
  
      /* quantization matrix for decoding, [cqm][qp%6][coef] */
      int             (*dequant4_mf[4])[16];   /* [4][6][16] */
-    int             (*dequant8_mf[2])[64];   /* [2][6][64] */
+    int             (*dequant8_mf[4])[64];   /* [4][6][64] */
      /* quantization matrix for trellis, [cqm][qp][coef] */
      int             (*unquant4_mf[4])[16];   /* [4][52][16] */
-    int             (*unquant8_mf[2])[64];   /* [2][52][64] */
+    int             (*unquant8_mf[4])[64];   /* [4][52][64] */
      /* quantization matrix for deadzone */
      udctcoef        (*quant4_mf[4])[16];     /* [4][52][16] */
-    udctcoef        (*quant8_mf[2])[64];     /* [2][52][64] */
+    udctcoef        (*quant8_mf[4])[64];     /* [4][52][64] */
      udctcoef        (*quant4_bias[4])[16];   /* [4][52][16] */
-    udctcoef        (*quant8_bias[2])[64];   /* [2][52][64] */
-    udctcoef        (*nr_offset_emergency)[3][64];
+    udctcoef        (*quant8_bias[4])[64];   /* [4][52][64] */
+    udctcoef        (*nr_offset_emergency)[4][64];
  
      /* mv/ref cost arrays. */
      uint16_t *cost_mv[QP_MAX+1];
@@ -554,11 +564,11 @@ struct x264_t
      /* Current MB DCT coeffs */
      struct
      {
-        ALIGNED_16( dctcoef luma16x16_dc[16] );
+        ALIGNED_16( dctcoef luma16x16_dc[3][16] );
          ALIGNED_16( dctcoef chroma_dc[2][4] );
          // FIXME share memory?
-        ALIGNED_16( dctcoef luma8x8[4][64] );
-        ALIGNED_16( dctcoef luma4x4[16+8][16] );
+        ALIGNED_16( dctcoef luma8x8[12][64] );
+        ALIGNED_16( dctcoef luma4x4[16*3][16] );
      } dct;
  
      /* MB table and cache for current frame/mb */
@@ -647,7 +657,7 @@ struct x264_t
          int16_t *cbp;                       /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc  (all set for PCM)*/
          int8_t  (*intra4x4_pred_mode)[8];   /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
                                              /* actually has only 7 entries; set to 8 for write-combining optimizations */
-        uint8_t (*non_zero_count)[16+4+4];  /* nzc. for I_PCM set to 16 */
+        uint8_t (*non_zero_count)[16*3];    /* nzc. for I_PCM set to 16 */
          int8_t  *chroma_pred_mode;          /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
          int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
          uint8_t (*mvd[2])[8][2];            /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
@@ -693,8 +703,8 @@ struct x264_t
              /* space for p_fenc and p_fdec */
  #define FENC_STRIDE 16
  #define FDEC_STRIDE 32
-            ALIGNED_16( pixel fenc_buf[24*FENC_STRIDE] );
-            ALIGNED_16( pixel fdec_buf[27*FDEC_STRIDE] );
+            ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
+            ALIGNED_16( pixel fdec_buf[52*FDEC_STRIDE] );
  
              /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
              ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -717,14 +727,15 @@ struct x264_t
              /* pointer over mb of the frame to be compressed */
              pixel *p_fenc[3]; /* y,u,v */
              /* pointer to the actual source frame, not a block copy */
-            pixel *p_fenc_plane[2]; /* y,uv */
+            pixel *p_fenc_plane[3];
  
              /* pointer over mb of the frame to be reconstructed  */
              pixel *p_fdec[3];
  
              /* pointer over mb of the references */
              int i_fref[2];
-            pixel *p_fref[2][X264_REF_MAX*2][4+1]; /* last: yN, yH, yV, yHV, uv */
+            /* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */
+            pixel *p_fref[2][X264_REF_MAX*2][12];
              pixel *p_fref_w[X264_REF_MAX*2];  /* weighted fullpel luma */
              uint16_t *p_integral[2][X264_REF_MAX];
  
@@ -862,18 +873,18 @@ struct x264_t
  
      } stat;
  
-    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4 */
+    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */
      udctcoef (*nr_offset)[64];
      uint32_t (*nr_residual_sum)[64];
      uint32_t *nr_count;
  
-    ALIGNED_16( udctcoef nr_offset_denoise[3][64] );
-    ALIGNED_16( uint32_t nr_residual_sum_buf[2][3][64] );
-    uint32_t nr_count_buf[2][3];
+    ALIGNED_16( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
+    uint32_t nr_count_buf[2][4];
  
      /* Buffers that are allocated per-thread even in sliced threads. */
      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
-    pixel *intra_border_backup[5][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+    pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
      /* Deblock strength values are stored for each 4x4 partition. In MBAFF
       * there are four extra values that need to be stored, located in [4][i]. */
      uint8_t (*deblock_strength[2])[2][8][4];
diff --git a/common/deblock.c b/common/deblock.c
index e8718571e6fd59cc34a39c6d2e775a6e4c2c7391..812f4ea9ba8f664b93fdb4ce7ab69233ffd85a83 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -354,7 +354,7 @@ void deblock_strength_mbaff_c( uint8_t nnz_cache[X264_SCAN8_SIZE], int8_t ref[2]
              else
              {
                  const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1];
-                uint8_t (*nnz)[24] = h->mb.non_zero_count;
+                uint8_t (*nnz)[48] = h->mb.non_zero_count;
  
                  for( int i = 0; i < 8; i++ )
                  {
@@ -415,7 +415,7 @@ void deblock_strength_mbaff_c( uint8_t nnz_cache[X264_SCAN8_SIZE], int8_t ref[2]
              for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
              {
                  int mbn_intra = IS_INTRA( h->mb.type[mbn_xy] );
-                uint8_t (*nnz)[24] = h->mb.non_zero_count;
+                uint8_t (*nnz)[48] = h->mb.non_zero_count;
  
                  uint32_t nnz_top[4];
                  nnz_top[0] = nnz[mbn_xy][3*4+0];
@@ -502,11 +502,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
  
          pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
-        pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
+        pixel *pixuv = h->fdec->plane[1] + (8<<CHROMA444)*mb_y*strideuv + 16*mb_x;
+        intptr_t uvdiff = CHROMA444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
          if( mb_y & MB_INTERLACED )
          {
              pixy -= 15*stridey;
-            pixuv -= 7*strideuv;
+            pixuv -= ((8<<CHROMA444)-1)*strideuv;
          }
  
          int stride2y  = stridey << MB_INTERLACED;
@@ -521,7 +522,16 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
              deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
                                   stride2y, bs[dir][edge], qp, 0,\
                                   h->loopf.deblock_luma##intra[dir] );\
-            if( !(edge & 1) )\
+            if( CHROMA444 )\
+            {\
+                deblock_edge##intra( h, pixuv          + 4*edge*(dir?stride2uv:1),\
+                                     stride2uv, bs[dir][edge], chroma_qp, 0,\
+                                     h->loopf.deblock_luma##intra[dir] );\
+                deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
+                                     stride2uv, bs[dir][edge], chroma_qp, 0,\
+                                     h->loopf.deblock_luma##intra[dir] );\
+            }\
+            else if( !(edge & 1) )\
                  deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
                                       stride2uv, bs[dir][edge], chroma_qp, 1,\
                                       h->loopf.deblock_chroma##intra[dir] );\
@@ -535,38 +545,45 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                  int chroma_qp[2];
                  int left_qp[2];
                  int current_qp = h->mb.qp[mb_xy];
+                x264_deblock_inter_t luma_deblock = deblock_v_luma_mbaff_c;
+                x264_deblock_inter_t chroma_deblock = CHROMA444 ? deblock_v_luma_mbaff_c : deblock_v_chroma_mbaff_c;
+                x264_deblock_intra_t luma_intra_deblock = deblock_v_luma_intra_mbaff_c;
+                x264_deblock_intra_t chroma_intra_deblock = CHROMA444 ? deblock_v_luma_intra_mbaff_c : deblock_v_chroma_intra_mbaff_c;
+                int c = CHROMA444 ? 0 : 1;
+
                  left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
                  luma_qp[0] = (current_qp + left_qp[0] + 1) >> 1;
                  chroma_qp[0] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[0]] + 1) >> 1;
                  if( bs[0][0][0] == 4)
                  {
-                    deblock_edge_intra( h, pixy,      2*stridey,  bs[0][0], luma_qp[0],   0, deblock_v_luma_intra_mbaff_c );
-                    deblock_edge_intra( h, pixuv,     2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
-                    deblock_edge_intra( h, pixuv + 1, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   0, luma_intra_deblock );
+                    deblock_edge_intra( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock );
+                    deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock );
                  }
                  else
                  {
-                    deblock_edge( h, pixy,      2*stridey,  bs[0][0], luma_qp[0],   0, deblock_v_luma_mbaff_c );
-                    deblock_edge( h, pixuv,     2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
-                    deblock_edge( h, pixuv + 1, 2*strideuv, bs[0][0], chroma_qp[0], 1, deblock_v_chroma_mbaff_c );
+                    deblock_edge( h, pixy,           2*stridey,  bs[0][0], luma_qp[0],   0, luma_deblock );
+                    deblock_edge( h, pixuv,          2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock );
+                    deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock );
                  }
  
                  int offy = MB_INTERLACED ? 4 : 0;
                  int offuv = MB_INTERLACED ? 3 : 0;
+                if( CHROMA444 ) offuv = offy;
                  left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
                  luma_qp[1] = (current_qp + left_qp[1] + 1) >> 1;
                  chroma_qp[1] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
                  if( bs[0][4][0] == 4)
                  {
-                    deblock_edge_intra( h, pixy      + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   0, deblock_v_luma_intra_mbaff_c );
-                    deblock_edge_intra( h, pixuv     + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
-                    deblock_edge_intra( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_intra_mbaff_c );
+                    deblock_edge_intra( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   0, luma_intra_deblock );
+                    deblock_edge_intra( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], c, chroma_intra_deblock );
+                    deblock_edge_intra( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], c, chroma_intra_deblock );
                  }
                  else
                  {
-                    deblock_edge( h, pixy      + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   0, deblock_v_luma_mbaff_c );
-                    deblock_edge( h, pixuv     + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
-                    deblock_edge( h, pixuv + 1 + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], 1, deblock_v_chroma_mbaff_c );
+                    deblock_edge( h, pixy           + (stridey<<offy),   2*stridey,  bs[0][4], luma_qp[1],   0, luma_deblock );
+                    deblock_edge( h, pixuv          + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], c, chroma_deblock );
+                    deblock_edge( h, pixuv + uvdiff + (strideuv<<offuv), 2*strideuv, bs[0][4], chroma_qp[1], c, chroma_deblock );
                  }
              }
              else
@@ -603,7 +620,13 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
  
                      // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows
                      deblock_edge( h, pixy      + j*stridey,  2* stridey, bs[1][4*j], qp_top,  0, deblock_v_luma_c );
-                    deblock_edge( h, pixuv     + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 1, deblock_v_chroma_c );
+                    if( CHROMA444 )
+                    {
+                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c );
+                        deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c );
+                    }
+                    else
+                        deblock_edge( h, pixuv          + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 1, deblock_v_chroma_c );
                  }
              }
              else
@@ -642,7 +665,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
   * TODO:
   *  deblock macroblock edges
   *  support analysis partitions smaller than 16x16
- *  deblock chroma
+ *  deblock chroma for 4:2:0
   *  handle duplicate refs correctly
   *  handle cavlc+8x8dct correctly
   */
@@ -650,6 +673,7 @@ void x264_macroblock_deblock( x264_t *h )
  {
      int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset );
      int qp = h->mb.i_qp;
+    int qpc = h->mb.i_chroma_qp;
      if( qp <= qp_thresh || h->mb.i_type == P_SKIP )
          return;
  
@@ -661,14 +685,22 @@ void x264_macroblock_deblock( x264_t *h )
                                     bs, 4 >> SLICE_MBAFF, h->sh.i_type == SLICE_TYPE_B, h );
  
      int transform_8x8 = h->mb.b_transform_8x8;
-    pixel *fdec = h->mb.pic.p_fdec[0];
  
      #define FILTER( dir, edge )\
      do\
      {\
-        deblock_edge( h, fdec + 4*edge*(dir?FDEC_STRIDE:1),\
+        deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\
                        FDEC_STRIDE, bs[dir][edge], qp, 0,\
                        h->loopf.deblock_luma[dir] );\
+        if( CHROMA444 )\
+        {\
+            deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\
+                          FDEC_STRIDE, bs[dir][edge], qpc, 0,\
+                          h->loopf.deblock_luma[dir] );\
+            deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\
+                          FDEC_STRIDE, bs[dir][edge], qpc, 0,\
+                          h->loopf.deblock_luma[dir] );\
+        }\
      } while(0)
  
      if( !transform_8x8 ) FILTER( 0, 1 );
diff --git a/common/frame.c b/common/frame.c

index 3d3f16a6e09760a577a7bd8052c1e6f1c680055f..daf446e248e56437317d332a8b80ff8ccc5d820b 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -42,16 +42,32 @@ static int align_plane_size( int x, int disalign )
      return x;
  }
  
-x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
+static int x264_frame_internal_csp( int external_csp ) /* external csp -> csp used for internally allocated frames */
  {
+    switch( external_csp & X264_CSP_MASK ) /* only the bits selected by X264_CSP_MASK choose the layout */
+    {
+        case X264_CSP_NV12:
+        case X264_CSP_I420:
+        case X264_CSP_YV12:
+            return X264_CSP_NV12; /* all three inputs are stored with the NV12 frame layout */
+        case X264_CSP_I444:
+        case X264_CSP_YV24:
+            return X264_CSP_I444; /* both inputs are stored with the I444 frame layout */
+        default:
+            return X264_CSP_NONE; /* unrecognized input; x264_frame_new jumps to its fail path on this value */
+    }
+}
  
+static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
+{
+    x264_frame_t *frame;
+    int i_csp = x264_frame_internal_csp( h->param.i_csp );
      int i_mb_count = h->mb.i_mb_count;
      int i_stride, i_width, i_lines;
      int i_padv = PADV << PARAM_INTERLACED;
-    int luma_plane_size, chroma_plane_size;
      int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
      int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
+    int luma_plane_count = i_csp == X264_CSP_NV12 ? 1 : 3; /* planes handled by the luma-sized allocation loop below: 1 for NV12, otherwise 3 */
  
      CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
  
@@ -60,13 +76,28 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
      i_lines  = h->mb.i_mb_height*16;
      i_stride = align_stride( i_width + 2*PADH, align, disalign );
  
-    frame->i_plane = 2;
-    for( int i = 0; i < 2; i++ )
+    if( i_csp == X264_CSP_NV12 )
      {
-        frame->i_width[i] = i_width >> i;
-        frame->i_lines[i] = i_lines >> i;
-        frame->i_stride[i] = i_stride;
+        frame->i_plane = 2;
+        for( int i = 0; i < 2; i++ )
+        {
+            frame->i_width[i] = i_width >> i;
+            frame->i_lines[i] = i_lines >> i;
+            frame->i_stride[i] = i_stride;
+        }
      }
+    else if( i_csp == X264_CSP_I444 )
+    {
+        frame->i_plane = 3;
+        for( int i = 0; i < 3; i++ )
+        {
+            frame->i_width[i] = i_width;
+            frame->i_lines[i] = i_lines;
+            frame->i_stride[i] = i_stride;
+        }
+    }
+    else
+        goto fail;
  
      frame->i_width_lowres = frame->i_width[0]/2;
      frame->i_lines_lowres = frame->i_lines[0]/2;
@@ -95,40 +126,46 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  
      frame->orig = frame;
  
-    luma_plane_size = align_plane_size( frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv), disalign );
-    chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
-
-    CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
-    frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
-    if( PARAM_INTERLACED )
+    if( i_csp == X264_CSP_NV12 )
      {
-        CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
-        frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;
+        int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
+        CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+        if( PARAM_INTERLACED )
+        {
+            CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;
+        }
      }
  
      /* all 4 luma planes allocated together, since the cacheline split code
       * requires them to be in-phase wrt cacheline alignment. */
-    if( h->param.analyse.i_subpel_refine && b_fdec )
+
+    for( int p = 0; p < luma_plane_count; p++ )
      {
-        /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-        CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size * sizeof(pixel) );
-        if( PARAM_INTERLACED )
-            CHECKED_MALLOC( frame->buffer_fld[0], 4*luma_plane_size * sizeof(pixel) );
-        for( int i = 0; i < 4; i++ )
+        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
+        if( h->param.analyse.i_subpel_refine && b_fdec )
          {
-            frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
-            frame->filtered_fld[i] = frame->buffer_fld[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+            /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
+            CHECKED_MALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
+            if( PARAM_INTERLACED )
+                CHECKED_MALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
+            for( int i = 0; i < 4; i++ )
+            {
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+            }
+            frame->plane[p] = frame->filtered[p][0];
+            frame->plane_fld[p] = frame->filtered_fld[p][0];
+        }
+        else
+        {
+            CHECKED_MALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
+            if( PARAM_INTERLACED )
+                CHECKED_MALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
+            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
          }
-        frame->plane[0] = frame->filtered[0];
-        frame->plane_fld[0] = frame->filtered_fld[0];
-    }
-    else
-    {
-        CHECKED_MALLOC( frame->buffer[0], luma_plane_size * sizeof(pixel) );
-        if( PARAM_INTERLACED )
-            CHECKED_MALLOC( frame->buffer_fld[0], luma_plane_size * sizeof(pixel) );
-        frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
-        frame->filtered_fld[0] = frame->plane_fld[0] = frame->buffer_fld[0] + frame->i_stride[0] * i_padv + PADH;
      }
  
      frame->b_duplicate = 0;
@@ -168,7 +205,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
      {
          if( h->frames.b_have_lowres )
          {
-            luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
+            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
  
              CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
              for( int i = 0; i < 4; i++ )
@@ -318,7 +355,7 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
          h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
                            stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 );
      }
-    else
+    else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_YV12 )
      {
          get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
          get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
@@ -327,6 +364,15 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
                                       (pixel*)pix[2], stride[2]/sizeof(pixel),
                                       h->param.i_width>>1, h->param.i_height>>1 );
      }
+    else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
+    {
+        get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 );
+        get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 );
+        h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
+                          stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height );
+        h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2],
+                          stride[2]/sizeof(pixel), h->param.i_width, h->param.i_height );
+    }
      return 0;
  }
  
@@ -410,32 +456,33 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
          return;
      for( int i = 0; i < frame->i_plane; i++ )
      {
+        int shift = i && !CHROMA444; /* 1 for planes i > 0 unless CHROMA444; halves the vertical sizes computed below */
          int stride = frame->i_stride[i];
-        int width = 16*h->sps->i_mb_width;
-        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> !!i;
+        int width = 16*h->mb.i_mb_width;
+        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> shift;
          int padh = PADH;
-        int padv = PADV >> !!i;
+        int padv = PADV >> shift;
          // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
          if( b_end && !b_start )
-            height += 4 >> (!!i + SLICE_MBAFF);
+            height += 4 >> (shift + SLICE_MBAFF);
          pixel *pix;
          if( SLICE_MBAFF )
          {
              // border samples for each field are extended separately
-            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
-            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i );
-            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i );
+            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
+            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, shift );
+            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, shift );
  
-            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> !!i;
+            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> shift;
              if( b_end && !b_start )
-                height += 4 >> (!!i);
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
+                height += 4 >> shift;
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
          }
          else
          {
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
          }
      }
  }
@@ -446,25 +493,26 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y
       * but up to 3 of the horizontal ones may be wrong.
         we want to expand border from the last filtered pixel */
      int b_start = !mb_y;
-    int stride = frame->i_stride[0];
      int width = 16*h->mb.i_mb_width + 8;
      int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16;
      int padh = PADH - 4;
      int padv = PADV - 8;
-    for( int i = 1; i < 4; i++ )
-    {
-        // buffer: 8 luma, to match the hpel filter
-        pixel *pix;
-        if( SLICE_MBAFF )
+    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
+        for( int i = 1; i < 4; i++ )
          {
-            pix = frame->filtered_fld[i] + (16*mb_y - 16) * stride - 4;
-            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
-            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
-        }
+            int stride = frame->i_stride[p];
+            // buffer: 8 luma, to match the hpel filter
+            pixel *pix;
+            if( SLICE_MBAFF )
+            {
+                pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4;
+                plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
+                plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
+            }
  
-        pix = frame->filtered[i] + (16*mb_y - 8) * stride - 4;
-        plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
-    }
+            pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4;
+            plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 );
+        }
  }
  
  void x264_frame_expand_border_lowres( x264_frame_t *frame )
@@ -478,16 +526,17 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
      for( int i = 0; i < frame->i_plane; i++ )
      {
          int i_width = h->param.i_width;
-        int i_height = h->param.i_height >> !!i;
+        int shift = i && !CHROMA444;
+        int i_height = h->param.i_height >> shift;
          int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
-        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i;
+        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
  
          if( i_padx )
          {
              for( int y = 0; y < i_height; y++ )
                  pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
-                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-i],
-                              i_padx>>i, sizeof(pixel)<<i );
+                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-shift],
+                              i_padx>>shift, sizeof(pixel)<<shift );
          }
          if( i_pady )
          {
@@ -503,10 +552,11 @@ void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
  {
      for( int i = 0; i < h->fenc->i_plane; i++ )
      {
+        int shift = i && !CHROMA444;
          int stride = h->fenc->i_stride[i];
-        int height = h->param.i_height >> !!i;
-        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i;
-        int mbsize = (16>>!!i);
+        int height = h->param.i_height >> shift;
+        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int mbsize = 16>>shift;
          pixel *fenc = h->fenc->plane[i] + mbsize * mb_x;
          for( int y = height; y < height + pady; y++ )
              memcpy( fenc + y*stride,
diff --git a/common/frame.h b/common/frame.h

index 313e4bbecff9686df257844ee02ee9ceba2ddb52..db066f3971cac246c65ccfa2437dd0bd9f45c18c 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -65,16 +65,16 @@ typedef struct x264_frame
  
      /* YUV buffer */
      int     i_plane;
-    int     i_stride[2];
-    int     i_width[2];
-    int     i_lines[2];
+    int     i_stride[3];
+    int     i_width[3];
+    int     i_lines[3];
      int     i_stride_lowres;
      int     i_width_lowres;
      int     i_lines_lowres;
-    pixel *plane[2];
-    pixel *plane_fld[2];
-    pixel *filtered[4]; /* plane[0], H, V, HV */
-    pixel *filtered_fld[4];
+    pixel *plane[3];
+    pixel *plane_fld[3];
+    pixel *filtered[3][4]; /* per plane p: plane[p], H, V, HV */
+    pixel *filtered_fld[3][4]; /* same layout; entry [p][0] is plane_fld[p] */
      pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
      uint16_t *integral;
  
@@ -187,7 +187,6 @@ typedef struct
                                 int bframe, x264_t *h );
  } x264_deblock_function_t;
  
-x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
  void          x264_frame_delete( x264_frame_t *frame );
  
  int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
diff --git a/common/macroblock.c b/common/macroblock.c

index 8ca8eca163de86d71aa51a4bf5c52020fc68eff4..0fb7d12c7e89c285a9f447ac2f9d31c694b6b81a 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -28,6 +28,12 @@
  #include "common.h"
  #include "encoder/me.h"
  
+#define MC_LUMA(list,p) /* mc_luma of plane p from reference list `list` into fdec; uses the caller's h, x, y, i_ref, mvx, mvy, width, height */ \
+    h->mc.mc_luma( &h->mb.pic.p_fdec[(p)][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
+                   &h->mb.pic.p_fref[(list)][i_ref][(p)*4], h->mb.pic.i_stride[(p)], \
+                   mvx, mvy, 4*width, 4*height, \
+                   (list) ? weight_none : &h->sh.weight[i_ref][(p)] )
+
  static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
  {
      int i8    = x264_scan8[0]+x+8*y;
@@ -35,28 +41,33 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
      int mvx   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
      int mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
  
-    h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
-                   h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
-                   mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
-
-    // chroma is offset if MCing from a field of opposite parity
-    if( MB_INTERLACED & i_ref )
-        mvy += (h->mb.i_mb_y & 1)*4 - 2;
-
-    h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                     &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
-                     mvx, mvy, 2*width, 2*height );
-
-    if( h->sh.weight[i_ref][1].weightfn )
-        h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                   &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                   &h->sh.weight[i_ref][1], height*2 );
-    if( h->sh.weight[i_ref][2].weightfn )
-        h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                   &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                                                   &h->sh.weight[i_ref][2],height*2 );
+    MC_LUMA( 0, 0 );
  
+    if( CHROMA444 )
+    {
+        MC_LUMA( 0, 1 );
+        MC_LUMA( 0, 2 );
+    }
+    else
+    {
+        // chroma is offset if MCing from a field of opposite parity
+        if( MB_INTERLACED & i_ref )
+            mvy += (h->mb.i_mb_y & 1)*4 - 2;
+
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
+                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                         h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
+                         mvx, mvy, 2*width, 2*height );
+
+        if( h->sh.weight[i_ref][1].weightfn )
+            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][1], height*2 );
+        if( h->sh.weight[i_ref][2].weightfn )
+            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                       &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                       &h->sh.weight[i_ref][2],height*2 );
+    }
  }
  static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
  {
@@ -65,19 +76,33 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
      int mvx   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
      int mvy   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
  
-    h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
-                   h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
-                   mvx, mvy, 4*width, 4*height, weight_none );
+    MC_LUMA( 1, 0 );
  
-    if( MB_INTERLACED & i_ref )
-        mvy += (h->mb.i_mb_y & 1)*4 - 2;
+    if( CHROMA444 )
+    {
+        MC_LUMA( 1, 1 );
+        MC_LUMA( 1, 2 );
+    }
+    else
+    {
+        if( MB_INTERLACED & i_ref )
+            mvy += (h->mb.i_mb_y & 1)*4 - 2;
  
-    h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
-                     &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
-                     mvx, mvy, 2*width, 2*height );
+        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
+                         &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                         h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
+                         mvx, mvy, 2*width, 2*height );
+    }
  }
  
+#define MC_LUMA_BI(p) \
+    src0 = h->mc.get_ref( tmp0, &i_stride0, &h->mb.pic.p_fref[0][i_ref0][p*4], h->mb.pic.i_stride[p], \
+                          mvx0, mvy0, 4*width, 4*height, weight_none ); \
+    src1 = h->mc.get_ref( tmp1, &i_stride1, &h->mb.pic.p_fref[1][i_ref1][p*4], h->mb.pic.i_stride[p], \
+                          mvx1, mvy1, 4*width, 4*height, weight_none ); \
+    h->mc.avg[i_mode]( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
+                       src0, i_stride0, src1, i_stride1, weight );
+
  static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
  {
      int i8 = x264_scan8[0]+x+8*y;
@@ -94,26 +119,32 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
      ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
      pixel *src0, *src1;
  
-    src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
-                          mvx0, mvy0, 4*width, 4*height, weight_none );
-    src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                          mvx1, mvy1, 4*width, 4*height, weight_none );
-    h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
-                       src0, i_stride0, src1, i_stride1, weight );
+    MC_LUMA_BI( 0 );
  
-    if( MB_INTERLACED & i_ref0 )
-        mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
-    if( MB_INTERLACED & i_ref1 )
-        mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
-
-    h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
-                     mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
-                     mvx1, mvy1, 2*width, 2*height );
-    h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
-    h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
+    if( CHROMA444 )
+    {
+        MC_LUMA_BI( 1 );
+        MC_LUMA_BI( 2 );
+    }
+    else
+    {
+        if( MB_INTERLACED & i_ref0 )
+            mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
+        if( MB_INTERLACED & i_ref1 )
+            mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
+
+        h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
+                         mvx0, mvy0, 2*width, 2*height );
+        h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
+                         mvx1, mvy1, 2*width, 2*height );
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
+    }
  }
  
+#undef MC_LUMA
+#undef MC_LUMA_BI
+
  void x264_mb_mc_8x8( x264_t *h, int i8 )
  {
      int x = 2*(i8&1);
@@ -225,7 +256,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
      CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  
      /* all coeffs */
-    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
+    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
  
      if( h->param.b_cabac )
      {
@@ -316,7 +347,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
      if( !b_lookahead )
      {
          for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
-            for( int j = 0; j < 2; j++ )
+            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
              {
                  /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
                  CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
@@ -361,7 +392,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
          for( int i = 0; i <= PARAM_INTERLACED; i++ )
              x264_free( h->deblock_strength[i] );
          for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
-            for( int j = 0; j < 2; j++ )
+            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
                  x264_free( h->intra_border_backup[i][j] - 16 );
      }
      x264_free( h->scratch_buffer );
@@ -460,6 +491,15 @@ void x264_macroblock_thread_init( x264_t *h )
                            (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
      h->mb.i_mb_prev_xy = -1;
  
+    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+    if( CHROMA444 )
+    {
+        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
+        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
+    }
      /* fdec:      fenc:
       * yyyyyyy
       * yYYYY      YYYY
@@ -470,12 +510,11 @@ void x264_macroblock_thread_init( x264_t *h )
       * uUU vVV    UUVV
       * uUU vVV
       */
-    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
-    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
-    h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
-    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-    h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+    else
+    {
+        h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+    }
  }
  
  void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
@@ -495,24 +534,25 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
          dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
  }
  
-static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_mbaff )
+static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
  {
-    int w = (i ? 8 : 16);
+    int mb_interlaced = b_mbaff && MB_INTERLACED;
+    int w = b_chroma ? 8 : 16;
      int i_stride = h->fdec->i_stride[i];
-    int i_stride2 = i_stride << MB_INTERLACED;
-    int i_pix_offset = MB_INTERLACED
+    int i_stride2 = i_stride << mb_interlaced;
+    int i_pix_offset = mb_interlaced
                       ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                       : 16 * mb_x + w * mb_y * i_stride;
      pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    int fdec_idx = b_mbaff ? (MB_INTERLACED ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
+    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
      pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
      int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
      /* ref_pix_offset[0] references the current field and [1] the opposite field. */
-    if( MB_INTERLACED )
+    if( mb_interlaced )
          ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
      h->mb.pic.i_stride[i] = i_stride2;
      h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
-    if( i )
+    if( b_chroma )
      {
          h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
          memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
@@ -525,78 +565,81 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
      }
      else
      {
-        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 );
-        memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
+        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
+        memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
          if( b_mbaff )
-            h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = intra_fdec[-1];
+            h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
      }
      if( b_mbaff )
      {
          for( int j = 0; j < w; j++ )
-            if( i )
+            if( b_chroma )
              {
                  h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
                  h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
              }
              else
-                h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+                h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
      }
      pixel *plane_src, **filtered_src;
      for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
      {
          // Interpolate between pixels in same field.
-        if( MB_INTERLACED )
+        if( mb_interlaced )
          {
              plane_src = h->fref[0][j>>1]->plane_fld[i];
-            filtered_src = h->fref[0][j>>1]->filtered_fld;
+            filtered_src = h->fref[0][j>>1]->filtered_fld[i];
          }
          else
          {
              plane_src = h->fref[0][j]->plane[i];
-            filtered_src = h->fref[0][j]->filtered;
+            filtered_src = h->fref[0][j]->filtered[i];
          }
-        h->mb.pic.p_fref[0][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+        h->mb.pic.p_fref[0][j][i*4] = plane_src + ref_pix_offset[j&1];
  
-        if( !i )
+        if( !b_chroma )
          {
              for( int k = 1; k < 4; k++ )
-                h->mb.pic.p_fref[0][j][k] = filtered_src[k] + ref_pix_offset[j&1];
-            if( h->sh.weight[j][0].weightfn )
-                h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> MB_INTERLACED][ref_pix_offset[j&1]];
-            else
-                h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
+                h->mb.pic.p_fref[0][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
+            if( !i )
+            {
+                if( h->sh.weight[j][0].weightfn )
+                    h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> mb_interlaced][ref_pix_offset[j&1]];
+                else
+                    h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
+            }
          }
      }
      if( h->sh.i_type == SLICE_TYPE_B )
          for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
          {
-            if( MB_INTERLACED )
+            if( mb_interlaced )
              {
                  plane_src = h->fref[1][j>>1]->plane_fld[i];
-                filtered_src = h->fref[1][j>>1]->filtered_fld;
+                filtered_src = h->fref[1][j>>1]->filtered_fld[i];
              }
              else
              {
                  plane_src = h->fref[1][j]->plane[i];
-                filtered_src = h->fref[1][j]->filtered;
+                filtered_src = h->fref[1][j]->filtered[i];
              }
-            h->mb.pic.p_fref[1][j][i?4:0] = plane_src + ref_pix_offset[j&1];
+            h->mb.pic.p_fref[1][j][i*4] = plane_src + ref_pix_offset[j&1];
  
-            if( !i )
+            if( !b_chroma )
                  for( int k = 1; k < 4; k++ )
-                    h->mb.pic.p_fref[1][j][k] = filtered_src[k] + ref_pix_offset[j&1];
+                    h->mb.pic.p_fref[1][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
          }
  }
  
  static const x264_left_table_t left_indices[4] =
  {
      /* Current is progressive */
-    {{ 4, 4, 5, 5}, { 3,  3,  7,  7}, {16+1, 16+1, 16+4+1, 16+4+1}, {0, 0, 1, 1}, {0, 0, 0, 0}},
-    {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+3, 16+3, 16+4+3, 16+4+3}, {2, 2, 3, 3}, {1, 1, 1, 1}},
+    {{ 4, 4, 5, 5}, { 3,  3,  7,  7}, {16+1, 16+1, 32+1, 32+1}, {0, 0, 1, 1}, {0, 0, 0, 0}}, /* only the third group changes: its last two entries move from base 16+4 to base 32 */
+    {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+5, 16+5, 32+5, 32+5}, {2, 2, 3, 3}, {1, 1, 1, 1}}, /* third group: offset 3 -> 5 and base 16+4 -> 32; presumably follows the 4-entry row spacing used for nnz[16+...] / nnz[32+...] elsewhere in this patch -- TODO confirm */
      /* Current is interlaced */
-    {{ 4, 6, 4, 6}, { 3, 11,  3, 11}, {16+1, 16+1, 16+4+1, 16+4+1}, {0, 2, 0, 2}, {0, 1, 0, 1}},
+    {{ 4, 6, 4, 6}, { 3, 11,  3, 11}, {16+1, 16+1, 32+1, 32+1}, {0, 2, 0, 2}, {0, 1, 0, 1}}, /* third group: same 16+4 -> 32 rebasing of the last two entries */
      /* Both same */
-    {{ 4, 5, 6, 3}, { 3,  7, 11, 15}, {16+1, 16+3, 16+4+1, 16+4+3}, {0, 1, 2, 3}, {0, 0, 1, 1}}
+    {{ 4, 5, 6, 3}, { 3,  7, 11, 15}, {16+1, 16+5, 32+1, 32+5}, {0, 1, 2, 3}, {0, 0, 1, 1}} /* third group: both changes combined (16+3 -> 16+5, 16+4+n -> 32+n) */
  };
  
  static void ALWAYS_INLINE x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced )
@@ -797,7 +840,7 @@ void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, in
      /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
      /* By only dereferencing them once, we avoid this issue. */
      int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode;
-    uint8_t (*nnz)[24] = h->mb.non_zero_count;
+    uint8_t (*nnz)[48] = h->mb.non_zero_count;
      int16_t *cbp = h->mb.cbp;
  
      const x264_left_table_t *left_index_table = h->mb.left_index_table;
@@ -810,10 +853,9 @@ void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, in
          CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] );
  
          /* load non_zero_count */
-        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[top][12] );
-        /* shift because x264_scan8[16] is misaligned */
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &nnz[top][18] ) << 8;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &nnz[top][22] ) << 8;
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] );
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] );
  
          /* Finish the prefetching */
          for( int l = 0; l < lists; l++ )
@@ -834,79 +876,80 @@ void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, in
          M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
  
          /* load non_zero_count */
-        M32( &h->mb.cache.non_zero_count[x264_scan8[   0] - 8] ) = 0x80808080U;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
-        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8] ) = 0x80808080U;
      }
  
      if( h->mb.i_neighbour & MB_LEFT )
      {
+        int ltop = left[LTOP];
+        int lbot = b_mbaff ? left[LBOT] : ltop;
          if( b_mbaff )
          {
-            const int16_t top_luma = (cbp[left[LTOP]] >> (left_index_table->mv[0]&(~1))) & 2;
-            const int16_t bot_luma = (cbp[left[LBOT]] >> (left_index_table->mv[2]&(~1))) & 2;
-            h->mb.cache.i_cbp_left = (cbp[left[LTOP]] & 0xfff0) | (bot_luma<<2) | top_luma;
+            const int16_t top_luma = (cbp[ltop] >> (left_index_table->mv[0]&(~1))) & 2;
+            const int16_t bot_luma = (cbp[lbot] >> (left_index_table->mv[2]&(~1))) & 2;
+            h->mb.cache.i_cbp_left = (cbp[ltop] & 0xfff0) | (bot_luma<<2) | top_luma;
          }
          else
-            h->mb.cache.i_cbp_left = cbp[left[0]];
-        if( b_mbaff )
+            h->mb.cache.i_cbp_left = cbp[ltop];
+
+        /* load intra4x4 */
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = i4x4[ltop][left_index_table->intra[0]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = i4x4[ltop][left_index_table->intra[1]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = i4x4[lbot][left_index_table->intra[2]];
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[lbot][left_index_table->intra[3]];
+
+        /* load non_zero_count */
+        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = nnz[ltop][left_index_table->nnz[0]];
+        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = nnz[ltop][left_index_table->nnz[1]];
+        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
+        h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];
+
+        if( CHROMA444 )
          {
-            /* load intra4x4 */
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[left[LTOP]][left_index_table->intra[0]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[left[LTOP]][left_index_table->intra[1]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[left[LBOT]][left_index_table->intra[2]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[left[LBOT]][left_index_table->intra[3]];
-
-            /* load non_zero_count */
-            h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[LTOP]][left_index_table->nnz[0]];
-            h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[LTOP]][left_index_table->nnz[1]];
-            h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[LBOT]][left_index_table->nnz[2]];
-            h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[LBOT]][left_index_table->nnz[3]];
-
-            h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[left[LTOP]][left_index_table->nnz_chroma[0]];
-            h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[left[LBOT]][left_index_table->nnz_chroma[1]];
-
-            h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left[LTOP]][left_index_table->nnz_chroma[2]];
-            h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left[LBOT]][left_index_table->nnz_chroma[3]];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16];
+            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32];
+            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32];
          }
          else
          {
-            int l = left[0];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = i4x4[l][left_index_table->intra[0]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = i4x4[l][left_index_table->intra[1]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] = i4x4[l][left_index_table->intra[2]];
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[l][left_index_table->intra[3]];
-
-            h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[l][left_index_table->nnz[0]];
-            h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[l][left_index_table->nnz[1]];
-            h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[l][left_index_table->nnz[2]];
-            h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[l][left_index_table->nnz[3]];
-
-            h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] = nnz[l][left_index_table->nnz_chroma[0]];
-            h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] = nnz[l][left_index_table->nnz_chroma[1]];
-
-            h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[l][left_index_table->nnz_chroma[2]];
-            h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[l][left_index_table->nnz_chroma[3]];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[0]];
+            h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[1]];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[2]];
+            h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[3]];
          }
      }
      else
      {
          h->mb.cache.i_cbp_left = -1;
  
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[8 ] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] =
+        h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] =
          h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1;
  
          /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] =
          h->mb.cache.non_zero_count[x264_scan8[10] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+2] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] =
-        h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = 0x80;
+        h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
+        h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
+        if( CHROMA444 )
+        {
+            h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
+            h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
+            h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] =
+            h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = 0x80;
+        }
      }
  
      if( h->pps->b_transform_8x8_mode )
@@ -926,15 +969,33 @@ void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, in
      {
          x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
          x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
-        x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
-        x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0 );
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0 );
+        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 0 );
+        if( CHROMA444 )
+        {
+            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+ 4*FDEC_STRIDE );
+            x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+12*FDEC_STRIDE );
+            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+ 4*FDEC_STRIDE );
+            x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+12*FDEC_STRIDE );
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 0 );
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 0 );
+        }
+        else
+        {
+            x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
+            x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
+        }
      }
      else
      {
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 1 );
-        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1 );
+        x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 1 );
+        if( CHROMA444 )
+        {
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 1 );
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 1 );
+        }
+        else
+            x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 1 );
      }
  
      if( h->fdec->integral )
@@ -1342,7 +1403,7 @@ void x264_macroblock_deblock_strength( x264_t *h )
              int s8x8 = h->mb.i_b8_stride;
              int s4x4 = h->mb.i_b4_stride;
  
-            uint8_t (*nnz)[24] = h->mb.non_zero_count;
+            uint8_t (*nnz)[48] = h->mb.non_zero_count;
              const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
  
              if( h->mb.i_neighbour & MB_TOP )
@@ -1421,7 +1482,7 @@ void x264_macroblock_deblock_strength( x264_t *h )
      /* Munge NNZ for cavlc + 8x8dct */
      if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
      {
-        uint8_t (*nnz)[24] = h->mb.non_zero_count;
+        uint8_t (*nnz)[48] = h->mb.non_zero_count;
          int top = h->mb.i_mb_top_xy;
          int *left = h->mb.i_mb_left_xy;
  
@@ -1472,18 +1533,18 @@ void x264_macroblock_deblock_strength( x264_t *h )
                                 bs, mvy_limit, h->sh.i_type == SLICE_TYPE_B, h );
  }
  
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_mbaff )
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) /* copies the macroblock's p_fdec pixels for plane i back into h->fdec->plane; b_chroma now selects the path that used to be keyed on i != 0 */
  {
-    int w = i ? 8 : 16;
+    int w = b_chroma ? 8 : 16; /* row multiplier for the plane offset below: 8 on the interleaved-chroma path, 16 otherwise */
      int i_stride = h->fdec->i_stride[i];
      int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
      int i_pix_offset = (b_mbaff && MB_INTERLACED)
                       ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
                       : 16 * mb_x + w * mb_y * i_stride;
-    if( i )
+    if( b_chroma ) /* interleaved chroma: p_fdec[1] and p_fdec[2] are written together into plane[1] */
          h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
      else
-        h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+        h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 ); /* 16x16 copy for plane i; the callers in x264_macroblock_cache_save pass i = 1 and 2 here only when CHROMA444 */
  }
  
  static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff )
@@ -1494,8 +1555,16 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
       * mbpair in intra_border_backup[2]. */
      int backup_dst = !b_mbaff ? 0 : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
      memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
-    memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7,   8*sizeof(pixel) );
-    memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7,   8*sizeof(pixel) );
+    if( CHROMA444 )
+    {
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*sizeof(pixel) );
+        memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*sizeof(pixel) );
+    }
+    else
+    {
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7,   8*sizeof(pixel) );
+        memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7,   8*sizeof(pixel) );
+    }
      if( b_mbaff )
      {
          if( mb_y&1 )
@@ -1503,9 +1572,17 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
              int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
              backup_dst = MB_INTERLACED ? 2 : 0;
              memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16  ], h->mb.pic.p_fdec[0]+backup_src, 16*sizeof(pixel) );
-            backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
-            memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
-            memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
+            if( CHROMA444 )
+            {
+                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src, 16*sizeof(pixel) );
+                memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16  ], h->mb.pic.p_fdec[2]+backup_src, 16*sizeof(pixel) );
+            }
+            else
+            {
+                backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
+                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16  ], h->mb.pic.p_fdec[1]+backup_src,  8*sizeof(pixel) );
+                memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src,  8*sizeof(pixel) );
+            }
          }
      }
      else
@@ -1513,8 +1590,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
          /* In progressive we update intra_border_backup in-place, so the topleft neighbor will
           * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
          h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
-        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7];
-        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7];
+        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444];
+        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444];
      }
  }
  
@@ -1535,14 +1612,26 @@ void x264_macroblock_cache_save( x264_t *h )
      if( SLICE_MBAFF )
      {
          x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 1 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1 );
+        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 1 );
+        if( CHROMA444 )
+        {
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 1 );
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 1 );
+        }
+        else
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 1 );
      }
      else
      {
          x264_macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0 );
-        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0 );
+        x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 0 );
+        if( CHROMA444 )
+        {
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 0 );
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 0 );
+        }
+        else
+            x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 0 );
      }
  
      x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
@@ -1570,11 +1659,11 @@ void x264_macroblock_cache_save( x264_t *h )
      {
          h->mb.qp[i_mb_xy] = 0;
          h->mb.i_last_dqp = 0;
-        h->mb.i_cbp_chroma = 2;
+        h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
          h->mb.i_cbp_luma = 0xf;
-        h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
+        h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
          h->mb.b_transform_8x8 = 0;
-        for( int i = 0; i < 24; i++ )
+        for( int i = 0; i < 48; i++ )
              h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
      }
      else
@@ -1587,14 +1676,21 @@ void x264_macroblock_cache_save( x264_t *h )
      }
  
      /* save non zero count */
-    CP32( &nnz[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
-    CP32( &nnz[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
-    CP32( &nnz[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
-    CP32( &nnz[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
-    M16( &nnz[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
-    M16( &nnz[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
-    M16( &nnz[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
-    M16( &nnz[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
+    CP32( &nnz[ 0+0*4], &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+    CP32( &nnz[ 0+1*4], &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+    CP32( &nnz[ 0+2*4], &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+    CP32( &nnz[ 0+3*4], &h->mb.cache.non_zero_count[x264_scan8[10]] );
+    CP32( &nnz[16+0*4], &h->mb.cache.non_zero_count[x264_scan8[16+0]] );
+    CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
+    CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
+    CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
+    if( CHROMA444 )
+    {
+        CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
+        CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
+        CP32( &nnz[32+2*4], &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] );
+        CP32( &nnz[32+3*4], &h->mb.cache.non_zero_count[x264_scan8[32+10]] );
+    }
  
      if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
          h->mb.b_transform_8x8 = 0;
diff --git a/common/macroblock.h b/common/macroblock.h

index feb682e503cca3838fe0067738076dfc3f5165fe..278031f6bf5cfbeeb93821c1970974b975f003bb 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -272,12 +272,30 @@ static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
  
  enum cabac_ctx_block_cat_e
  {
-    DCT_LUMA_DC   = 0,
-    DCT_LUMA_AC   = 1,
-    DCT_LUMA_4x4  = 2,
-    DCT_CHROMA_DC = 3,
-    DCT_CHROMA_AC = 4,
-    DCT_LUMA_8x8  = 5,
+    DCT_LUMA_DC     = 0, /* values 0-5 are unchanged from the removed lines; only the alignment differs */
+    DCT_LUMA_AC     = 1,
+    DCT_LUMA_4x4    = 2,
+    DCT_CHROMA_DC   = 3,
+    DCT_CHROMA_AC   = 4,
+    DCT_LUMA_8x8    = 5,
+    DCT_CHROMAU_DC  = 6, /* 6-9: new DC/AC/4x4/8x8 categories; they fill column 1 of ctx_cat_plane (presumably the U plane in 4:4:4 -- confirm against callers) */
+    DCT_CHROMAU_AC  = 7,
+    DCT_CHROMAU_4x4 = 8,
+    DCT_CHROMAU_8x8 = 9,
+    DCT_CHROMAV_DC  = 10, /* 10-13: the same four categories again; they fill column 2 of ctx_cat_plane (presumably the V plane) */
+    DCT_CHROMAV_AC  = 11,
+    DCT_CHROMAV_4x4 = 12,
+    DCT_CHROMAV_8x8 = 13,
+};
+
+static const uint8_t ctx_cat_plane[6][3] = /* rows follow enum values 0..5 (DCT_LUMA_DC .. DCT_LUMA_8x8); the three columns hold the LUMA / CHROMAU / CHROMAV variant of that row's category */
+{
+    { DCT_LUMA_DC,  DCT_CHROMAU_DC,  DCT_CHROMAV_DC},
+    { DCT_LUMA_AC,  DCT_CHROMAU_AC,  DCT_CHROMAV_AC},
+    {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4},
+    {0}, /* row 3 (the value of DCT_CHROMA_DC): all entries zero */
+    {0}, /* row 4 (the value of DCT_CHROMA_AC): all entries zero */
+    {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8}
  };
  
  /* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
diff --git a/common/mc.c b/common/mc.c

index 92169236ce32527525321547020c8499d14fc4e9..8c4c2b9e3cd622f6111e77f0cdae596fc32926d9 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -512,40 +512,44 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
  void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
  {
      const int b_interlaced = PARAM_INTERLACED;
-    int stride = frame->i_stride[0];
-    const int width = frame->i_width[0];
      int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
      int height = (b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8;
-    int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
  
      if( mb_y & b_interlaced )
          return;
  
-    if( !b_interlaced || h->mb.b_adaptive_mbaff )
-        h->mc.hpel_filter(
-            frame->filtered[1] + offs,
-            frame->filtered[2] + offs,
-            frame->filtered[3] + offs,
-            frame->plane[0] + offs,
-            stride, width + 16, height - start,
-            h->scratch_buffer );
-
-    if( b_interlaced )
+    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
      {
-        /* MC must happen between pixels in the same field. */
-        stride = frame->i_stride[0] << 1;
-        start = (mb_y*16 >> 1) - 8;
-        int height_fld = ((b_end ? frame->i_lines[0] : mb_y*16) >> 1) + 8;
-        offs = start*stride - 8;
-        for( int i = 0; i < 2; i++, offs += frame->i_stride[0] )
-        {
+        int stride = frame->i_stride[p];
+        const int width = frame->i_width[p];
+        int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
+
+        if( !b_interlaced || h->mb.b_adaptive_mbaff )
              h->mc.hpel_filter(
-                frame->filtered_fld[1] + offs,
-                frame->filtered_fld[2] + offs,
-                frame->filtered_fld[3] + offs,
-                frame->plane_fld[0] + offs,
-                stride, width + 16, height_fld - start,
+                frame->filtered[p][1] + offs,
+                frame->filtered[p][2] + offs,
+                frame->filtered[p][3] + offs,
+                frame->plane[p] + offs,
+                stride, width + 16, height - start,
                  h->scratch_buffer );
+
+        if( b_interlaced )
+        {
+            /* MC must happen between pixels in the same field. */
+            stride = frame->i_stride[p] << 1;
+            start = (mb_y*16 >> 1) - 8;
+            int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8;
+            offs = start*stride - 8;
+            for( int i = 0; i < 2; i++, offs += frame->i_stride[p] )
+            {
+                h->mc.hpel_filter(
+                    frame->filtered_fld[p][1] + offs,
+                    frame->filtered_fld[p][2] + offs,
+                    frame->filtered_fld[p][3] + offs,
+                    frame->plane_fld[p] + offs,
+                    stride, width + 16, height_fld - start,
+                    h->scratch_buffer );
+            }
          }
      }
  
@@ -556,6 +560,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
  
      if( frame->integral )
      {
+        int stride = frame->i_stride[0];
          if( start < 0 )
          {
              memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
diff --git a/common/quant.c b/common/quant.c

index cc4ea8649da84894b4973fb4901822d41843d3d8..dc7e8ae740e03a96f7bc7a1403d8b95aa14ae039 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -581,8 +581,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
      }
  #endif
  #endif // HIGH_BIT_DEPTH
-    pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
-    pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
-    pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
-    pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC];
+    pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
+    pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
+    pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
+    pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
+    pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];
+
+    pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
+    pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
+    pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
+    pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
  }
diff --git a/common/quant.h b/common/quant.h

index cf9c8b15a8cc340cef7826a0b54d8cc2875000c3..09364143612fe6e4e60c85bac396203588147351 100644 (file)
--- a/common/quant.h
+++ b/common/quant.h
@@ -45,8 +45,8 @@ typedef struct
      int (*decimate_score15)( dctcoef *dct );
      int (*decimate_score16)( dctcoef *dct );
      int (*decimate_score64)( dctcoef *dct );
-    int (*coeff_last[6])( dctcoef *dct );
-    int (*coeff_level_run[5])( dctcoef *dct, x264_run_level_t *runlevel );
+    int (*coeff_last[14])( dctcoef *dct );
+    int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
  } x264_quant_function_t;
  
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
diff --git a/common/set.c b/common/set.c

index f967fd8de28593c7a44dd116877374702854b183..b958c7f13a0949df229588b071573245e6ff3c96 100644 (file)
--- a/common/set.c
+++ b/common/set.c
@@ -78,15 +78,16 @@ int x264_cqm_init( x264_t *h )
      int def_dequant4[6][16];
      int def_dequant8[6][64];
      int quant4_mf[4][6][16];
-    int quant8_mf[2][6][64];
+    int quant8_mf[4][6][64];
      int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
                          32 - h->param.analyse.i_luma_deadzone[0],
                          32 - 11, 32 - 21 };
      int max_qp_err = -1;
      int max_chroma_qp_err = -1;
      int min_qp_err = QP_MAX+1;
+    int num_8x8_lists = CHROMA444 ? 4 : 2;
  
-    for( int i = 0; i < 6; i++ )
+    for( int i = 0; i < 4 + num_8x8_lists; i++ )
      {
          int size = i<4 ? 16 : 64;
          int j;
@@ -140,7 +141,7 @@ int x264_cqm_init( x264_t *h )
                  h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
                       quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
              }
-        for( int i_list = 0; i_list < 2; i_list++ )
+        for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
              for( int i = 0; i < 64; i++ )
              {
                  h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
@@ -168,7 +169,7 @@ int x264_cqm_init( x264_t *h )
                      max_chroma_qp_err = q;
              }
          if( h->param.analyse.b_transform_8x8 )
-            for( int i_list = 0; i_list < 2; i_list++ )
+            for( int i_list = 0; i_list < num_8x8_lists; i_list++ )
                  for( int i = 0; i < 64; i++ )
                  {
                      h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
@@ -181,8 +182,10 @@ int x264_cqm_init( x264_t *h )
                          continue;
                      }
                      h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
-                    if( j > 0xffff && q > max_qp_err )
+                    if( j > 0xffff && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) )
                          max_qp_err = q;
+                    if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) )
+                        max_chroma_qp_err = q;
                  }
      }
  
@@ -190,9 +193,9 @@ int x264_cqm_init( x264_t *h )
      x264_emms();
      CHECKED_MALLOC( h->nr_offset_emergency, sizeof(*h->nr_offset_emergency)*(QP_MAX-QP_MAX_SPEC) );
      for( int q = 0; q < QP_MAX - QP_MAX_SPEC; q++ )
-        for( int cat = 0; cat <= 2; cat++ )
+        for( int cat = 0; cat < 3 + CHROMA444; cat++ )
          {
-            int dct8x8 = cat == 1;
+            int dct8x8 = cat&1;
              int size = dct8x8 ? 64 : 16;
              udctcoef *nr_offset = h->nr_offset_emergency[q][cat];
              /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */
@@ -210,7 +213,7 @@ int x264_cqm_init( x264_t *h )
                      continue;
                  }
  
-                int thresh = i == 0 ? dc_threshold : cat == 2 ? chroma_threshold : luma_threshold;
+                int thresh = i == 0 ? dc_threshold : cat >= 2 ? chroma_threshold : luma_threshold;
                  if( q < thresh )
                  {
                      nr_offset[i] = 0;
@@ -253,7 +256,7 @@ fail:
  }
  
  #define CQM_DELETE( n, max )\
-    for( int i = 0; i < max; i++ )\
+    for( int i = 0; i < (max); i++ )\
      {\
          int j;\
          for( j = 0; j < i; j++ )\
@@ -275,12 +278,12 @@ fail:
  void x264_cqm_delete( x264_t *h )
  {
      CQM_DELETE( 4, 4 );
-    CQM_DELETE( 8, 2 );
+    CQM_DELETE( 8, CHROMA444 ? 4 : 2 );
      x264_free( h->nr_offset_emergency );
  }
  
  static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
-                           uint8_t *cqm, const uint8_t *jvt, int length )
+                                  uint8_t *cqm, const uint8_t *jvt, int length )
  {
      int i;
  
@@ -341,11 +344,16 @@ int x264_cqm_parse_file( x264_t *h, const char *filename )
          memset( p, ' ', strcspn( p, "\n" ) );
  
      b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_LUMA",   h->param.cqm_4iy, x264_cqm_jvt4i, 16 );
-    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_CHROMA", h->param.cqm_4ic, x264_cqm_jvt4i, 16 );
      b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_LUMA",   h->param.cqm_4py, x264_cqm_jvt4p, 16 );
+    b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA4X4_CHROMA", h->param.cqm_4ic, x264_cqm_jvt4i, 16 );
      b_error |= x264_cqm_parse_jmlist( h, buf, "INTER4X4_CHROMA", h->param.cqm_4pc, x264_cqm_jvt4p, 16 );
      b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_LUMA",   h->param.cqm_8iy, x264_cqm_jvt8i, 64 );
      b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_LUMA",   h->param.cqm_8py, x264_cqm_jvt8p, 64 );
+    if( CHROMA444 ) /* chroma 8x8 lists go to cqm_8ic/cqm_8pc, not the luma lists parsed above */
+    {
+        b_error |= x264_cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8ic, x264_cqm_jvt8i, 64 );
+        b_error |= x264_cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8pc, x264_cqm_jvt8p, 64 );
+    }
  
      x264_free( buf );
      return b_error;
diff --git a/common/set.h b/common/set.h

index aeeb36c472fc6528c48c44dfad459fd971bd2408..4bbfea6e39ca70d9695151081c44a3fa0fa1d539 100644 (file)
--- a/common/set.h
+++ b/common/set.h
@@ -49,7 +49,9 @@ enum cqm4_e
  enum cqm8_e
  {
      CQM_8IY = 0,
-    CQM_8PY = 1
+    CQM_8PY = 1,
+    CQM_8IC = 2,
+    CQM_8PC = 3,
  };
  
  typedef struct
@@ -148,6 +150,7 @@ typedef struct
      } vui;
  
      int b_qpprime_y_zero_transform_bypass;
+    int i_chroma_format_idc;
  
  } x264_sps_t;
  
@@ -179,7 +182,7 @@ typedef struct
      int b_transform_8x8_mode;
  
      int i_cqm_preset;
-    const uint8_t *scaling_list[6]; /* could be 8, but we don't allow separate Cb/Cr lists */
+    const uint8_t *scaling_list[8]; /* could be 12, but we don't allow separate Cb/Cr lists */
  
  } x264_pps_t;
  
@@ -231,10 +234,11 @@ static const uint8_t x264_cqm_flat16[64] =
      16,16,16,16,16,16,16,16,
      16,16,16,16,16,16,16,16
  };
-static const uint8_t * const x264_cqm_jvt[6] =
+static const uint8_t * const x264_cqm_jvt[8] =
  {
      x264_cqm_jvt4i, x264_cqm_jvt4p,
      x264_cqm_jvt4i, x264_cqm_jvt4p,
+    x264_cqm_jvt8i, x264_cqm_jvt8p,
      x264_cqm_jvt8i, x264_cqm_jvt8p
  };
  
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm

index 55b4eb1f9a98e232f57cb0b1fe5010ea36370c74..393e197de422b1a7533041230ce984c62ed68af5 100644 (file)
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -55,7 +55,7 @@ struc cb
      .end: pointer 1
      align 16, resb 1
      .bits_encoded: resd 1
-    .state: resb 460
+    .state: resb 1024
  endstruc
  
  %macro LOAD_GLOBAL 4
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 15b620c6a2b4bb7f056f41db153aacb71bf4e619..21d4421faac5575553ae39c74c3a5b528ab8628b 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -342,7 +342,7 @@ void x264_analyse_weight_frame( x264_t *h, int end )
              int width = frame->i_width[0] + 2*PADH;
              int i_padv = PADV << PARAM_INTERLACED;
              int offset, height;
-            pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
              height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
              offset = h->fenc->i_lines_weighted*frame->i_stride[0];
              h->fenc->i_lines_weighted += height;
@@ -687,6 +687,22 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
      if( a->i_satd_i8x8chroma < COST_MAX )
          return;
  
+    if( CHROMA444 )
+    {
+        if( !h->mb.b_chroma_me )
+        {
+            a->i_satd_i8x8chroma = 0;
+            return;
+        }
+
+        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
+        h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
+        h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
+        a->i_satd_i8x8chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
+                             + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+        return;
+    }
+
      const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
  
      /* 8x8 prediction selection for chroma */
@@ -738,6 +754,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
      h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
  }
  
+/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
  static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
  {
      const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
@@ -794,7 +811,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
              int i_mode = *predict_mode;
  
              if( h->mb.b_lossless )
-                x264_predict_lossless_16x16( h, i_mode );
+                x264_predict_lossless_16x16( h, 0, i_mode );
              else
                  h->predict_16x16[i_mode]( p_dst );
  
@@ -865,7 +882,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                  int i_mode = *predict_mode;
  
                  if( h->mb.b_lossless )
-                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
+                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                  else
                      h->predict_8x8[i_mode]( p_dst_by, edge );
  
@@ -882,8 +899,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                  break;
  
              /* we need to encode this block now (for next ones) */
-            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
-            x264_mb_encode_i8x8( h, idx, a->i_qp );
+            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge );
  
              x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
          }
@@ -966,7 +982,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                      int i_mode = *predict_mode;
  
                      if( h->mb.b_lossless )
-                        x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
+                        x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
                      else
                          h->predict_4x4[i_mode]( p_dst_by );
  
@@ -991,8 +1007,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                  break;
  
              /* we need to encode this block now (for next ones) */
-            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
-            x264_mb_encode_i4x4( h, idx, a->i_qp );
+            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx] );
  
              h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
          }
@@ -1049,8 +1064,8 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
  
  static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
  {
-    pixel *p_dst = h->mb.pic.p_fdec[0];
      uint64_t i_satd, i_best;
+    int plane_count = CHROMA444 ? 3 : 1;
      h->mb.i_skip_intra = 0;
  
      if( h->mb.i_type == I_16x16 )
@@ -1071,114 +1086,124 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
      }
  
      /* RD selection for chroma prediction */
-    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
-    if( predict_mode[1] >= 0 )
+    if( !CHROMA444 )
      {
-        int8_t predict_mode_sorted[4];
-        int i_max;
-        int i_thresh = a->i_satd_i8x8chroma * 5/4;
-
-        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
+        const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+        if( predict_mode[1] >= 0 )
          {
-            int i_mode = *predict_mode;
-            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
-                predict_mode_sorted[i_max++] = i_mode;
-        }
+            int8_t predict_mode_sorted[4];
+            int i_max;
+            int i_thresh = a->i_satd_i8x8chroma * 5/4;
  
-        if( i_max > 0 )
-        {
-            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
-            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
-            /* the previous thing encoded was x264_intra_rd(), so the pixels and
-             * coefs for the current chroma mode are still around, so we only
-             * have to recount the bits. */
-            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
-            for( int i = 0; i < i_max; i++ )
+            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
              {
-                int i_mode = predict_mode_sorted[i];
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_8x8_chroma( h, i_mode );
-                else
+                int i_mode = *predict_mode;
+                if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
+                    predict_mode_sorted[i_max++] = i_mode;
+            }
+
+            if( i_max > 0 )
+            {
+                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
+                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
+                /* the previous thing encoded was x264_intra_rd(), so the pixels and
+                 * coefs for the current chroma mode are still around, so we only
+                 * have to recount the bits. */
+                i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
+                for( int i = 0; i < i_max; i++ )
                  {
-                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+                    int i_mode = predict_mode_sorted[i];
+                    if( h->mb.b_lossless )
+                        x264_predict_lossless_8x8_chroma( h, i_mode );
+                    else
+                    {
+                        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+                        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+                    }
+                    /* if we've already found a mode that needs no residual, then
+                     * probably any mode with a residual will be worse.
+                     * so avoid dct on the remaining modes to improve speed. */
+                    i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
+                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                  }
-                /* if we've already found a mode that needs no residual, then
-                 * probably any mode with a residual will be worse.
-                 * so avoid dct on the remaining modes to improve speed. */
-                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
-                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
+                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
+                h->mb.i_cbp_chroma = i_cbp_chroma_best;
              }
-            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
-            h->mb.i_cbp_chroma = i_cbp_chroma_best;
          }
      }
  
      if( h->mb.i_type == I_4x4 )
      {
-        pixel4 pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
-        int i_nnz = 0;
+        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
+        int nnz[3] = {0};
          for( int idx = 0; idx < 16; idx++ )
          {
-            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
+            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
+                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
+                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
              i_best = COST_MAX64;
  
-            predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
+            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
  
              if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                /* emulate missing topright samples */
-                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
+                for( int p = 0; p < plane_count; p++ )
+                    /* emulate missing topright samples */
+                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
  
              for( ; *predict_mode >= 0; predict_mode++ )
              {
                  int i_mode = *predict_mode;
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
-                else
-                    h->predict_4x4[i_mode]( p_dst_by );
                  i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
  
                  if( i_best > i_satd )
                  {
                      a->i_predict4x4[idx] = i_mode;
                      i_best = i_satd;
-                    pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
-                    pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
-                    pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
-                    pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
-                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
+                    for( int p = 0; p < plane_count; p++ )
+                    {
+                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
+                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
+                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
+                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
+                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
+                    }
                  }
              }
  
-            MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
-            MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
-            MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
-            MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
-            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
+            for( int p = 0; p < plane_count; p++ )
+            {
+                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
+                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
+                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
+                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
+                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
+            }
  
              h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
          }
      }
      else if( h->mb.i_type == I_8x8 )
      {
-        ALIGNED_ARRAY_16( pixel, edge,[33] );
+        ALIGNED_ARRAY_16( pixel, edge,[3],[48] );
+        pixel4 pels_h[3][2] = {{0}};
+        pixel pels_v[3][7] = {{0}};
+        uint16_t nnz[3][2] = {{0}}; //shut up gcc
          for( int idx = 0; idx < 4; idx++ )
          {
-            pixel4 pels_h[2] = {0};
-            pixel pels_v[7] = {0};
-            uint16_t i_nnz[2] = {0}; //shut up gcc
-            pixel *p_dst_by;
+            int x = idx&1;
+            int y = idx>>1;
+            int s8 = X264_SCAN8_0 + 2*x + 16*y;
+            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
+                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
+                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
              int cbp_luma_new = 0;
              int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
  
              i_best = COST_MAX64;
-            int x = idx&1;
-            int y = idx>>1;
-            int s8 = X264_SCAN8_0 + 2*x + 16*y;
  
-            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
-            predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
-            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
+            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
+            for( int p = 0; p < plane_count; p++ )
+                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
  
              for( ; *predict_mode >= 0; predict_mode++ )
              {
@@ -1186,12 +1211,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                  if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                      continue;
  
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
-                else
-                    h->predict_8x8[i_mode]( p_dst_by, edge );
                  h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
-                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
+                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
  
                  if( i_best > i_satd )
                  {
@@ -1199,46 +1220,70 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                      cbp_luma_new = h->mb.i_cbp_luma;
                      i_best = i_satd;
  
-                    pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
-                    pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
-                    if( !(idx&1) )
-                        for( int j = 0; j < 7; j++ )
-                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
-                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
-                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
+                    for( int p = 0; p < plane_count; p++ )
+                    {
+                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
+                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
+                        if( !(idx&1) )
+                            for( int j = 0; j < 7; j++ )
+                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
+                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
+                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
+                    }
                  }
              }
              a->i_cbp_i8x8_luma = cbp_luma_new;
-            MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
-            MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
-            if( !(idx&1) )
-                for( int j = 0; j < 7; j++ )
-                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
-            M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
-            M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
+            for( int p = 0; p < plane_count; p++ )
+            {
+                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
+                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
+                if( !(idx&1) )
+                    for( int j = 0; j < 7; j++ )
+                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
+                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
+                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
+            }
  
              x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
          }
      }
  }
  
-#define LOAD_FENC( m, src, xoff, yoff) \
+#define LOAD_FENC(m, src, xoff, yoff) \
+{ \
+    int s = !CHROMA444; \
      (m)->p_cost_mv = a->p_cost_mv; \
      (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
      (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
+    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
      (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
-    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
-    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
+    (m)->p_fenc[1] = &(src)[1][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
+    (m)->p_fenc[2] = &(src)[2][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
+}
  
  #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
+{ \
      (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
      (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
      (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
      (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
-    (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
+    if( CHROMA444 ) \
+    { \
+        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
+        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
+        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
+        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
+        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
+        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
+        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
+        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
+    } \
+    else \
+        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
      (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
      (m)->weight = weight_none; \
-    (m)->i_ref = ref;
+    (m)->i_ref = ref; \
+}
  
  #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
      (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
@@ -1611,18 +1656,29 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
      pixel *pix2 = pix1+8;
      const int i_stride = h->mb.pic.i_stride[1];
      const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
-    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
      const int i_ref = a->l0.me8x8[i8x8].i_ref;
      const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
      x264_weight_t *weight = h->sh.weight[i_ref];
  
      // FIXME weight can be done on 4x4 blocks even if mc is smaller
  #define CHROMA4x4MC( width, height, me, x, y ) \
-    h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
-    if( weight[1].weightfn ) \
-        weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
-    if( weight[2].weightfn ) \
-        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+    if( CHROMA444 ) \
+    { \
+        int mvx = (me).mv[0] + 4*2*x; \
+        int mvy = (me).mv[1] + 4*2*y; \
+        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
+                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
+        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
+                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
+    } \
+    else \
+    { \
+        h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+        if( weight[1].weightfn ) \
+            weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+        if( weight[2].weightfn ) \
+            weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); \
+    }
  
  
      if( size == PIXEL_4x4 )
@@ -1646,8 +1702,10 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
          CHROMA4x4MC( 2,4, m[1], 2,0 );
      }
  
-    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
-         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
+    int oe = (8*(i8x8&1) + 4*(i8x8&2)*FENC_STRIDE) >> !CHROMA444;
+    int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
+    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
+         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
  }
  
  static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
@@ -1763,21 +1821,39 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
  
  static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
  {
-    ALIGNED_ARRAY_16( pixel, pix, [4],[8*8] );
-    ALIGNED_ARRAY_16( pixel,  bi, [2],[8*8] );
+    ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] );
+    ALIGNED_ARRAY_16( pixel,  bi, [2],[16*16] );
      int l0_mvy_offset, l1_mvy_offset;
      int i_chroma_cost = 0;
  
  #define COST_BI_CHROMA( m0, m1, width, height ) \
  { \
-    l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-    l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
-    h->mc.mc_chroma( pix[0], pix[1], 8, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
-    h->mc.mc_chroma( pix[2], pix[3], 8, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
-    h->mc.avg[i_pixel+3]( bi[0], 8, pix[0], 8, pix[2], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-    h->mc.avg[i_pixel+3]( bi[1], 8, pix[1], 8, pix[3], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
-    i_chroma_cost  = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 8 ); \
-    i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 8 ); \
+    if( CHROMA444 ) \
+    { \
+        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
+                       m0.mv[0], m0.mv[1], 2*width, 2*height, weight_none ); \
+        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
+                       m0.mv[0], m0.mv[1], 2*width, 2*height, weight_none ); \
+        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
+                       m1.mv[0], m1.mv[1], 2*width, 2*height, weight_none ); \
+        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
+                       m1.mv[0], m1.mv[1], 2*width, 2*height, weight_none ); \
+        h->mc.avg[i_pixel]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+        h->mc.avg[i_pixel]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+        i_chroma_cost  = h->pixf.mbcmp[i_pixel]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
+        i_chroma_cost += h->pixf.mbcmp[i_pixel]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+    } \
+    else \
+    { \
+        l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+        l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
+        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
+        h->mc.avg[i_pixel+3]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+        h->mc.avg[i_pixel+3]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+        i_chroma_cost  = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
+        i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+    } \
  }
  
      if( i_pixel == PIXEL_16x16 )
@@ -1799,9 +1875,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
  
      pixel *p_fenc = h->mb.pic.p_fenc[0];
      pixel *p_fdec = h->mb.pic.p_fdec[0];
+    int s = !CHROMA444;
  
      a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
      if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
+    {
+        int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
          for( int i = 0; i < 4; i++ )
          {
              const int x = (i&1)*8;
@@ -1810,23 +1889,25 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
                                                                &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
              if( h->mb.b_chroma_me )
              {
-                a->i_cost8x8direct[i] += h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[1][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE )
-                                      +  h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
-                                                                   &h->mb.pic.p_fdec[2][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE );
+                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[1][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE )
+                                      +  h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[2][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE );
              }
              a->i_cost16x16direct += a->i_cost8x8direct[i];
  
              /* mb type cost */
              a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
          }
+    }
      else
      {
+        int chromapix = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
          a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
          if( h->mb.b_chroma_me )
          {
-            a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
-                                 +  h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
+            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+                                 +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
          }
      }
  }
@@ -1953,33 +2034,47 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
          if( h->mb.b_chroma_me )
          {
              ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] );
-            ALIGNED_ARRAY_16( pixel, bi, [8*FENC_STRIDE] );
+            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
  
-            if( MB_INTERLACED & a->l0.bi16x16.i_ref )
+            if( CHROMA444 )
              {
-                int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-                h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
-                                 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
+                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
+                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
+                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
+                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
+                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
+                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
              }
              else
-                h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
-
-            if( MB_INTERLACED & a->l1.bi16x16.i_ref )
              {
-                int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-                h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
-                                 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
-            }
-            else
-                h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+                if( MB_INTERLACED & a->l0.bi16x16.i_ref )
+                {
+                    int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
+                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
+                }
+                else
+                    h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+
+                if( MB_INTERLACED & a->l1.bi16x16.i_ref )
+                {
+                    int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
+                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
+                }
+                else
+                    h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
  
-            h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
-                                  h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
-            h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
-                                  h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+                h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
+                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+                h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
+                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
  
-            cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
-                   +  h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
+                cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
+                       +  h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
+            }
          }
  
          if( cost00 < a->i_cost16x16bi )
@@ -2659,13 +2754,18 @@ static inline void x264_mb_analyse_transform( x264_t *h )
  {
      if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
      {
-        /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
+        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
          x264_mb_mc( h );
  
-        int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
-                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
-        int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
-                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
+        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
+        int i_cost8 = 0, i_cost4 = 0;
+        for( int p = 0; p < plane_count; p++ )
+        {
+            i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
+                                                  h->mb.pic.p_fdec[p], FDEC_STRIDE );
+            i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
+                                                  h->mb.pic.p_fdec[p], FDEC_STRIDE );
+        }
  
          h->mb.b_transform_8x8 = i_cost8 < i_cost4;
          h->mb.b_skip_mc = 1;
@@ -2678,7 +2778,7 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *
      {
          x264_analyse_update_cache( h, a );
          h->mb.b_transform_8x8 ^= 1;
-        /* FIXME only luma is needed, but the score for comparison already includes chroma */
+        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
          int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
  
          if( *i_rd >= i_rd8 )
@@ -3042,8 +3142,16 @@ intra_analysis:
  
              if( h->mb.b_chroma_me )
              {
-                x264_mb_analyse_intra_chroma( h, &analysis );
-                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
+                if( CHROMA444 )
+                {
+                    x264_mb_analyse_intra( h, &analysis, i_cost );
+                    x264_mb_analyse_intra_chroma( h, &analysis );
+                }
+                else
+                {
+                    x264_mb_analyse_intra_chroma( h, &analysis );
+                    x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
+                }
                  analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                  analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                  analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
@@ -3085,9 +3193,13 @@ intra_analysis:
                   * it was an inter block. */
                  x264_analyse_update_cache( h, &analysis );
                  x264_macroblock_encode( h );
-                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
-                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
-                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
+                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
+                if( !CHROMA444 )
+                {
+                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
+                    h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+                }
                  x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                  goto intra_analysis;
              }
@@ -3441,8 +3553,16 @@ intra_analysis:
  
              if( h->mb.b_chroma_me )
              {
-                x264_mb_analyse_intra_chroma( h, &analysis );
-                x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
+                if( CHROMA444 )
+                {
+                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
+                    x264_mb_analyse_intra_chroma( h, &analysis );
+                }
+                else
+                {
+                    x264_mb_analyse_intra_chroma( h, &analysis );
+                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
+                }
                  analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                  analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                  analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
diff --git a/encoder/cabac.c b/encoder/cabac.c
index c95909587e26c8f271167074e63e01b4bec085f9..491b4ee7382fab3f447c977d5f5ea6f3a0847b07 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -491,53 +491,63 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i )
  
  static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra )
  {
+    static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020};
      int i_nza;
      int i_nzb;
  
      switch( i_cat )
      {
+        case DCT_LUMA_8x8:
+        case DCT_CHROMAU_8x8:
+        case DCT_CHROMAV_8x8:
          case DCT_LUMA_AC:
          case DCT_LUMA_4x4:
          case DCT_CHROMA_AC:
-            /* no need to test for skip/pcm */
+        case DCT_CHROMAU_AC:
+        case DCT_CHROMAU_4x4:
+        case DCT_CHROMAV_AC:
+        case DCT_CHROMAV_4x4:
              i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
              i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
              if( x264_constant_p(b_intra) && !b_intra )
-                return 85 + 4*i_cat + ((2*i_nzb + i_nza)&0x7f);
+                return base_ctx[i_cat] + ((2*i_nzb + i_nza)&0x7f);
              else
              {
                  i_nza &= 0x7f + (b_intra << 7);
                  i_nzb &= 0x7f + (b_intra << 7);
-                return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza;
+                return base_ctx[i_cat] + 2*!!i_nzb + !!i_nza;
              }
          case DCT_LUMA_DC:
-            i_nza = (h->mb.cache.i_cbp_left >> 8) & 1;
-            i_nzb = (h->mb.cache.i_cbp_top  >> 8) & 1;
-            return 85 + 4*i_cat + 2*i_nzb + i_nza;
+        case DCT_CHROMAU_DC:
+        case DCT_CHROMAV_DC:
+            i_idx -= LUMA_DC;
+            i_nza = (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1;
+            i_nzb = (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1;
+            return base_ctx[i_cat] + 2*i_nzb + i_nza;
          case DCT_CHROMA_DC:
-            /* no need to test skip/pcm */
-            i_idx -= 25;
-            i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (9 + i_idx)) & 1 : b_intra;
-            i_nzb = h->mb.cache.i_cbp_top  != -1 ? (h->mb.cache.i_cbp_top  >> (9 + i_idx)) & 1 : b_intra;
-            return 85 + 4*i_cat + 2*i_nzb + i_nza;
+            i_idx -= LUMA_DC;
+            i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1 : b_intra;
+            i_nzb = h->mb.cache.i_cbp_top  != -1 ? (h->mb.cache.i_cbp_top  >> (8 + i_idx)) & 1 : b_intra;
+            return base_ctx[i_cat] + 2*i_nzb + i_nza;
          default:
              return 0;
      }
  }
  
-
-static const uint16_t significant_coeff_flag_offset[2][6] =
+static const uint16_t significant_coeff_flag_offset[2][14] =
+{
+    { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
+    { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
+};
+static const uint16_t last_coeff_flag_offset[2][14] =
  {
-    { 105, 120, 134, 149, 152, 402 },
-    { 277, 292, 306, 321, 324, 436 }
+    { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748 },
+    { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757 }
  };
-static const uint16_t last_coeff_flag_offset[2][6] =
+static const uint16_t coeff_abs_level_m1_offset[14] =
  {
-    { 166, 181, 195, 210, 213, 417 },
-    { 338, 353, 367, 382, 385, 451 }
+    227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766
  };
-static const uint16_t coeff_abs_level_m1_offset[6] =
-    { 227, 237, 247, 257, 266, 426 };
  static const uint8_t significant_coeff_flag_offset_8x8[2][63] =
  {{
      0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
@@ -570,7 +580,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
  /* update node ctx after coding a level>1 */
      { 4, 4, 4, 4, 5, 6, 7, 7 }
  };
-static const uint8_t count_cat_m1[5] = {15, 14, 15, 3, 14};
+static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63};
  
  #if !RDO_SKIP_BS
  static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
@@ -585,6 +595,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
      last = h->quantf.coeff_last[ctx_block_cat]( l );
  
  #define WRITE_SIGMAP( l8x8 )\
+{\
      int i = 0;\
      while( 1 )\
      {\
@@ -608,18 +619,14 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
              coeffs[++coeff_idx] = l[i];\
              break;\
          }\
-    }
+    }\
+}
  
-    if( ctx_block_cat == DCT_LUMA_8x8 )
-    {
-        int count_m1 = 63;
+    int count_m1 = count_cat_m1[ctx_block_cat];
+    if( count_m1 == 63 )
          WRITE_SIGMAP( 1 )
-    }
      else
-    {
-        int count_m1 = count_cat_m1[ctx_block_cat];
          WRITE_SIGMAP( 0 )
-    }
  
      do
      {
@@ -651,7 +658,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
          x264_cabac_encode_bypass( cb, coeff_sign );
      } while( --coeff_idx >= 0 );
  }
-#define block_residual_write_cabac_8x8( h, cb, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, l )
+#define block_residual_write_cabac_8x8( h, cb, cat, l ) block_residual_write_cabac( h, cb, cat, l )
  
  #else
  
@@ -738,9 +745,9 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
      }
  }
  
-static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, dctcoef *l )
+static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
  {
-    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, l, 1 );
+    block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1 );
  }
  static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
  {
@@ -749,6 +756,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
  #endif
  
  #define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+do\
  {\
      int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
      if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
@@ -758,9 +766,22 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo
      }\
      else\
          x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
-}
+} while(0)
  
-void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
+#define block_residual_write_cabac_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+do\
+{\
+    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
+    if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
+    {\
+        x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
+        block_residual_write_cabac_8x8( h, cb, ctx_block_cat, l );\
+    }\
+    else\
+        x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
+} while(0)
+
+static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma )
  {
      const int i_mb_type = h->mb.i_type;
      int i_list;
@@ -782,12 +803,14 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
          bs_t s;
          bs_init( &s, cb->p, cb->p_end - cb->p );
  
-        for( int i = 0; i < 256; i++ )
-            bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
-        for( int ch = 1; ch < 3; ch++ )
-            for( int i = 0; i < 8; i++ )
-                for( int j = 0; j < 8; j++ )
-                    bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+        for( int p = 0; p < plane_count; p++ )
+            for( int i = 0; i < 256; i++ )
+                bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
+        if( chroma )
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = 0; i < 8; i++ )
+                    for( int j = 0; j < 8; j++ )
+                        bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
  
          bs_flush( &s );
          cb->p = s.p;
@@ -814,7 +837,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
              }
          }
  
-        x264_cabac_mb_intra_chroma_pred_mode( h, cb );
+        if( chroma )
+            x264_cabac_mb_intra_chroma_pred_mode( h, cb );
      }
      else if( i_mb_type == P_L0 )
      {
@@ -935,7 +959,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
      if( i_mb_type != I_16x16 )
      {
          x264_cabac_mb_cbp_luma( h, cb );
-        x264_cabac_mb_cbp_chroma( h, cb );
+        if( chroma )
+            x264_cabac_mb_cbp_chroma( h, cb );
      }
  
      if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
@@ -943,7 +968,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
          x264_cabac_mb_transform_size( h, cb );
      }
  
-    if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
+    if( h->mb.i_cbp_luma > 0 || (chroma && h->mb.i_cbp_chroma > 0) || i_mb_type == I_16x16 )
      {
          const int b_intra = IS_INTRA( i_mb_type );
          x264_cabac_mb_qp_delta( h, cb );
@@ -952,33 +977,89 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
          if( i_mb_type == I_16x16 )
          {
              /* DC Luma */
-            block_residual_write_cabac_cbf( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 1 );
+            for( int p = 0; p < plane_count; p++ )
+            {
+                block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
  
-            /* AC Luma */
-            if( h->mb.i_cbp_luma != 0 )
-                for( int i = 0; i < 16; i++ )
-                    block_residual_write_cabac_cbf( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 1 );
+                /* AC Luma */
+                if( h->mb.i_cbp_luma )
+                    for( int i = p*16; i < p*16+16; i++ )
+                        block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_AC][p], i, h->dct.luma4x4[i]+1, 1 );
+            }
          }
          else if( h->mb.b_transform_8x8 )
          {
-            for( int i = 0; i < 4; i++ )
-                if( h->mb.i_cbp_luma & ( 1 << i ) )
-                    block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i] );
+            if( plane_count == 3 )
+            {
+                ALIGNED_4( uint8_t nnzbak[3][8] );
+
+/* Stupid nnz munging in the case that neighbors don't have
+ * 8x8 transform enabled. */
+#define BACKUP( dst, src, res )\
+    dst = src;\
+    src = res;
+
+#define RESTORE( dst, src, res )\
+    src = dst;
+
+#define MUNGE_8x8_NNZ( MUNGE )\
+if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
+{\
+    MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
+    MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
+    MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
+    MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
+    MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
+    MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
+}\
+if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
+{\
+    MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
+    MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
+    MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
+    MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
+    MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
+    MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
+}\
+if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
+{\
+    MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
+    MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
+    MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
+}
+
+                MUNGE_8x8_NNZ( BACKUP )
+
+                for( int p = 0; p < 3; p++ )
+                    for( int i = 0; i < 4; i++ )
+                        if( h->mb.i_cbp_luma & ( 1 << i ) )
+                            block_residual_write_cabac_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra );
+
+                MUNGE_8x8_NNZ( RESTORE )
+            }
+            else
+            {
+                for( int i = 0; i < 4; i++ )
+                    if( h->mb.i_cbp_luma & ( 1 << i ) )
+                        block_residual_write_cabac_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] );
+            }
          }
          else
          {
-            for( int i = 0; i < 16; i++ )
-                if( h->mb.i_cbp_luma & ( 1 << ( i >> 2 ) ) )
-                    block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], b_intra );
+            for( int p = 0; p < plane_count; p++ )
+                for( int i = 0; i < 16; i++ )
+                    if( h->mb.i_cbp_luma & ( 1 << ( i >> 2 ) ) )
+                        block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+p*16, h->dct.luma4x4[i+p*16], b_intra );
          }
  
-        if( h->mb.i_cbp_chroma ) /* Chroma DC residual present */
+        if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
          {
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
-            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
+            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
+            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
              if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-                for( int i = 16; i < 24; i++ )
-                    block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
+                for( int ch = 1; ch < 3; ch++ )
+                    for( int i = ch*16; i < ch*16+4; i++ )
+                        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
          }
      }
  
@@ -987,6 +1068,14 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
  #endif
  }
  
+void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
+{
+    if( CHROMA444 )
+        x264_macroblock_write_cabac_internal( h, cb, 3, 0 );
+    else
+        x264_macroblock_write_cabac_internal( h, cb, 1, 1 );
+}
+
  #if RDO_SKIP_BS
  /*****************************************************************************
   * RD only; doesn't generate a valid bitstream
@@ -999,6 +1088,7 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  {
      const int i_mb_type = h->mb.i_type;
      int b_8x16 = h->mb.i_partition == D_8x16;
+    int plane_count = CHROMA444 ? 3 : 1;
  
      if( i_mb_type == P_8x8 )
      {
@@ -1025,14 +1115,24 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
          if( h->mb.i_cbp_luma & (1 << i8) )
          {
              if( h->mb.b_transform_8x8 )
-                block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i8] );
+            {
+                if( CHROMA444 )
+                    for( int p = 0; p < 3; p++ )
+                        block_residual_write_cabac_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 0 );
+                else
+                    block_residual_write_cabac_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
+            }
              else
-                for( int i4 = 0; i4 < 4; i4++ )
-                    block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 0 );
+                for( int p = 0; p < plane_count; p++ )
+                    for( int i4 = 0; i4 < 4; i4++ )
+                        block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16], 0 );
          }
  
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 0 );
+        if( h->mb.i_cbp_chroma )
+        {
+            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
+            block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
+        }
  
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
@@ -1041,15 +1141,16 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
  {
      int b_8x4 = i_pixel == PIXEL_8x4;
-    block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 0 );
+    int plane_count = CHROMA444 ? 3 : 1;
      if( i_pixel == PIXEL_4x4 )
-    {
          x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
-    }
      else
-    {
          x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
-        block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 0 );
+    for( int p = 0; p < plane_count; p++ )
+    {
+        block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4, h->dct.luma4x4[p*16+i4], 0 );
+        if( i_pixel != PIXEL_4x4 )
+            block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4], 0 );
      }
  }
  
@@ -1060,15 +1161,23 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
      x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
      x264_cabac_mb_cbp_luma( h, cb );
      if( h->mb.i_cbp_luma & (1 << i8) )
-        block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i8] );
+    {
+        if( CHROMA444 )
+            for( int p = 0; p < 3; p++ )
+                block_residual_write_cabac_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 1 );
+        else
+            block_residual_write_cabac_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] );
+    }
  }
  
  static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
  {
      const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
+    int plane_count = CHROMA444 ? 3 : 1;
      i_mode = x264_mb_pred_mode4x4_fix( i_mode );
      x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
-    block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 1 );
+    for( int p = 0; p < plane_count; p++ )
+        block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 );
  }
  
  static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
@@ -1077,12 +1186,13 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
      x264_cabac_mb_cbp_chroma( h, cb );
      if( h->mb.i_cbp_chroma > 0 )
      {
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 1 );
-        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 1 );
+        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
+        block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
  
          if( h->mb.i_cbp_chroma == 2 )
-            for( int i = 16; i < 24; i++ )
-                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 );
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = ch*16; i < ch*16+4; i++ )
+                    block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 );
      }
  }
  #endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c

index 5c71b1ab6d855fc44f3b40a8d16722de2adb470a..dcf4e9b4d7cc482d927583d62a3dd0c157929e14 100644 (file)
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -32,28 +32,31 @@
  #define RDO_SKIP_BS 0
  #endif
  
-static const uint8_t intra4x4_cbp_to_golomb[48]=
+/* [400,420][inter,intra] */
+static const uint8_t cbp_to_golomb[2][2][48] =
  {
-  3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
- 16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
- 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0
-};
-static const uint8_t inter_cbp_to_golomb[48]=
-{
-  0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
-  1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
-  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+    {{ 0,  1,  2,  5,  3,  6, 14, 10,  4, 15,  7, 11,  8, 12, 13,  9 },
+     { 1, 10, 11,  6, 12,  7, 14,  2, 13, 15,  8,  3,  9,  4,  5,  0 }},
+    {{ 0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,
+       1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+       6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 },
+     { 3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
+      16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7,  1,
+      41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0 }}
  };
+
  static const uint8_t mb_type_b_to_golomb[3][9]=
  {
      { 4,  8, 12, 10,  6, 14, 16, 18, 20 }, /* D_16x8 */
      { 5,  9, 13, 11,  7, 15, 17, 19, 21 }, /* D_8x16 */
      { 1, -1, -1, -1,  2, -1, -1, -1,  3 }  /* D_16x16 */
  };
+
  static const uint8_t sub_mb_type_p_to_golomb[4]=
  {
      3, 1, 2, 0
  };
+
  static const uint8_t sub_mb_type_b_to_golomb[13]=
  {
      10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
@@ -119,7 +122,7 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc
  {
      bs_t *s = &h->out.bs;
      static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
-    static const uint8_t count_cat[5] = {16, 15, 16, 4, 15};
+    static const uint8_t count_cat[14] = {16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
      x264_run_level_t runlevel;
      int i_trailing, i_total_zero, i_suffix_length;
      int i_total = 0;
@@ -196,7 +199,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
  
  #define block_residual_write_cavlc(h,cat,idx,l)\
  {\
-    int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 0 : idx )];\
+    int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
      uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
      if( !*nnz )\
          bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
@@ -211,7 +214,9 @@ static void cavlc_qp_delta( x264_t *h )
  
      /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
      if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
-        && !h->mb.cache.non_zero_count[x264_scan8[24]] )
+        && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]]
+        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]]
+        && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] )
      {
  #if !RDO_SKIP_BS
          h->mb.i_qp = h->mb.i_last_qp;
@@ -268,12 +273,12 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, int i8start, int
      {
          /* shuffle 8x8 dct coeffs into 4x4 lists */
          for( int i8 = i8start; i8 <= i8end; i8++ )
-            if( h->mb.i_cbp_luma & (1 << i8) )
+            if( h->mb.cache.non_zero_count[x264_scan8[i8*4]] )
                  h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
      }
  
      for( int i8 = i8start; i8 <= i8end; i8++ )
-        if( h->mb.i_cbp_luma & (1 << i8) )
+        if( h->mb.i_cbp_luma & (1 << (i8&3)) )
              for( int i4 = 0; i4 < 4; i4++ )
                  block_residual_write_cavlc( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
  }
@@ -287,6 +292,8 @@ void x264_macroblock_write_cavlc( x264_t *h )
      const int i_mb_type = h->mb.i_type;
      static const uint8_t i_offsets[3] = {5,23,0};
      int i_mb_i_offset = i_offsets[h->sh.i_type];
+    int plane_count = CHROMA444 ? 3 : 1;
+    int chroma = !CHROMA444;
  
  #if RDO_SKIP_BS
      s->i_bits_encoded = 0;
@@ -311,12 +318,14 @@ void x264_macroblock_write_cavlc( x264_t *h )
  
          bs_align_0( s );
  
-        for( int i = 0; i < 256; i++ )
-            bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
-        for( int ch = 1; ch < 3; ch++ )
-            for( int i = 0; i < 8; i++ )
-                for( int j = 0; j < 8; j++ )
-                    bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+        for( int p = 0; p < plane_count; p++ )
+            for( int i = 0; i < 256; i++ )
+                bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
+        if( chroma )
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = 0; i < 8; i++ )
+                    for( int j = 0; j < 8; j++ )
+                        bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
  
          bs_init( s, s->p, s->p_end - s->p );
          s->p_start = p_start;
@@ -348,13 +357,15 @@ void x264_macroblock_write_cavlc( x264_t *h )
              else
                  bs_write( s, 4, i_mode - (i_mode > i_pred) );
          }
-        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+        if( chroma )
+            bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
      }
      else if( i_mb_type == I_16x16 )
      {
          bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
                          h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
-        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+        if( chroma )
+            bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
      }
      else if( i_mb_type == P_L0 )
      {
@@ -495,42 +506,43 @@ void x264_macroblock_write_cavlc( x264_t *h )
      h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
  #endif
  
-    /* Coded block patern */
-    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
-        bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
-    else if( i_mb_type != I_16x16 )
-        bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    /* Coded block pattern */
+    if( i_mb_type != I_16x16 )
+        bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
  
      /* transform size 8x8 flag */
      if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
          bs_write1( s, h->mb.b_transform_8x8 );
  
-    /* write residual */
      if( i_mb_type == I_16x16 )
      {
          cavlc_qp_delta( h );
  
          /* DC Luma */
-        block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
+        for( int p = 0; p < plane_count; p++ )
+        {
+            block_residual_write_cavlc( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] );
  
-        /* AC Luma */
-        if( h->mb.i_cbp_luma )
-            for( int i = 0; i < 16; i++ )
-                block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
+            /* AC Luma */
+            if( h->mb.i_cbp_luma )
+                for( int i = p*16; i < p*16+16; i++ )
+                    block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
+        }
      }
      else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
      {
          cavlc_qp_delta( h );
-        x264_macroblock_luma_write_cavlc( h, 0, 3 );
+        x264_macroblock_luma_write_cavlc( h, 0, plane_count*4-1 );
      }
      if( h->mb.i_cbp_chroma )
      {
          /* Chroma DC residual present */
-        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
-        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
          if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-            for( int i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = ch*16; i < ch*16+4; i++ )
+                    block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
      }
  
  #if !RDO_SKIP_BS
@@ -551,6 +563,7 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
      bs_t *s = &h->out.bs;
      const int i_mb_type = h->mb.i_type;
      int b_8x16 = h->mb.i_partition == D_8x16;
+    int plane_count = CHROMA444 ? 3 : 1;
      int j;
  
      if( i_mb_type == P_8x8 )
@@ -575,9 +588,13 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  
      for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
      {
-        x264_macroblock_luma_write_cavlc( h, i8, i8 );
-        block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
-        block_residual_write_cavlc( h, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
+        for( int p = 0; p < plane_count; p++ )
+            x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 );
+        if( h->mb.i_cbp_chroma )
+        {
+            block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+            block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
+        }
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
  
@@ -586,14 +603,15 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  
  static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
  {
+    int plane_count = CHROMA444 ? 3 : 1;
      int b_8x4 = i_pixel == PIXEL_8x4;
      h->out.bs.i_bits_encoded = 0;
      cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
-    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
-    if( i_pixel != PIXEL_4x4 )
+    for( int p = 0; p < plane_count; p++ )
      {
-        i4 += 2-b_8x4;
-        block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+        block_residual_write_cavlc( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
+        if( i_pixel != PIXEL_4x4 )
+            block_residual_write_cavlc( h, DCT_LUMA_4x4, p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4] );
      }
  
      return h->out.bs.i_bits_encoded;
@@ -609,16 +627,20 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
  
  static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
  {
+    int plane_count = CHROMA444 ? 3 : 1;
      h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
-    bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
-    x264_macroblock_luma_write_cavlc( h, i8, i8 );
+    bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] );
+    for( int p = 0; p < plane_count; p++ )
+        x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 );
      return h->out.bs.i_bits_encoded;
  }
  
  static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
  {
+    int plane_count = CHROMA444 ? 3 : 1;
      h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    for( int p = 0; p < plane_count; p++ )
+        block_residual_write_cavlc( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] );
      return h->out.bs.i_bits_encoded;
  }
  
@@ -627,14 +649,13 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
      h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
      if( h->mb.i_cbp_chroma )
      {
-        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
-        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
  
          if( h->mb.i_cbp_chroma == 2 )
-        {
-            for( int i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
-        }
+            for( int ch = 1; ch < 3; ch++ )
+                for( int i = ch*16; i < ch*16+4; i++ )
+                    block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
      }
      return h->out.bs.i_bits_encoded;
  }
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 308778b0fb6b5fcf589bf5e92b2ab1296c98cb5c..b988b9a602c32e90dafa751cba5f6f7ecdcb875d 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -69,18 +69,24 @@ static void x264_frame_dump( x264_t *h )
      FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
      if( !f )
          return;
+
      /* Write the frame in display order */
-    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * sizeof(pixel), SEEK_SET );
-    for( int y = 0; y < h->param.i_height; y++ )
-        fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], sizeof(pixel), h->param.i_width, f );
-    int cw = h->param.i_width>>1;
-    int ch = h->param.i_height>>1;
-    pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
-    pixel *planev = planeu + cw*ch + 16;
-    h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
-    fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
-    fwrite( planev, 1, cw*ch*sizeof(pixel), f );
-    x264_free( planeu );
+    int frame_size = h->param.i_height * h->param.i_width * (3<<CHROMA444)/2 * sizeof(pixel);
+    fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
+    for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
+        for( int y = 0; y < h->param.i_height; y++ )
+            fwrite( &h->fdec->plane[p][y*h->fdec->i_stride[p]], sizeof(pixel), h->param.i_width, f );
+    if( !CHROMA444 )
+    {
+        int cw = h->param.i_width>>1;
+        int ch = h->param.i_height>>1;
+        pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
+        pixel *planev = planeu + cw*ch + 16;
+        h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
+        fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
+        fwrite( planev, 1, cw*ch*sizeof(pixel), f );
+        x264_free( planeu );
+    }
      fclose( f );
  }
  
@@ -401,16 +407,17 @@ static int x264_validate_parameters( x264_t *h, int b_open )
          return -1;
      }
  
-    if( h->param.i_width % 2 || h->param.i_height % 2 )
+    int i_csp = h->param.i_csp & X264_CSP_MASK;
+    if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
      {
-        x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n",
-                  h->param.i_width, h->param.i_height );
+        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I444/YV24 supported)\n" );
          return -1;
      }
-    int i_csp = h->param.i_csp & X264_CSP_MASK;
-    if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
+
+    if( i_csp < X264_CSP_I444 && (h->param.i_width % 2 || h->param.i_height % 2) )
      {
-        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12 supported)\n" );
+        x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n",
+                  h->param.i_width, h->param.i_height );
          return -1;
      }
  
@@ -759,6 +766,9 @@ static int x264_validate_parameters( x264_t *h, int b_open )
      h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 );
      h->mb.i_psy_rd = h->param.analyse.i_subpel_refine >= 6 ? FIX8( h->param.analyse.f_psy_rd ) : 0;
      h->mb.i_psy_trellis = h->param.analyse.i_trellis ? FIX8( h->param.analyse.f_psy_trellis / 4 ) : 0;
+    /* In 4:4:4 mode, chroma gets twice as much resolution, so we can halve its quality. */
+    if( b_open && i_csp >= X264_CSP_I444 && h->param.analyse.b_psy )
+        h->param.analyse.i_chroma_qp_offset += 6;
      /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */
      /* so we lower the chroma QP offset to compensate */
      if( b_open && h->mb.i_psy_rd )
@@ -1083,7 +1093,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
      x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
      x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
      if( h->param.b_cabac )
-        x264_cabac_init();
+        x264_cabac_init( h );
      else
          x264_cavlc_init();
      x264_pixel_init( h->param.cpu, &h->pixf );
@@ -1226,10 +1236,11 @@ x264_t *x264_encoder_open( x264_param_t *param )
                            h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
                            h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
                            h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") :
-                          "High 4:4:4 Predictive";
+                          h->sps->b_constraint_set3 == 1 ? "High 4:4:4 Intra" : "High 4:4:4 Predictive";
      char level[4];
      snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
-    if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 ) )
+    if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 &&
+        (h->sps->i_profile_idc >= PROFILE_BASELINE && h->sps->i_profile_idc <= PROFILE_EXTENDED) ) )
          strcpy( level, "1b" );
  
      if( h->sps->i_profile_idc < PROFILE_HIGH10 )
@@ -1239,8 +1250,8 @@ x264_t *x264_encoder_open( x264_param_t *param )
      }
      else
      {
-        x264_log( h, X264_LOG_INFO, "profile %s, level %s, bit depth %d\n",
-            profile, level, BIT_DEPTH );
+        x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n",
+            profile, level, CHROMA444 ? "4:4:4" : "4:2:0", BIT_DEPTH );
      }
  
      return h;
@@ -1443,7 +1454,7 @@ int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
  
      /* generate picture parameters */
      x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-    x264_pps_write( &h->out.bs, h->pps );
+    x264_pps_write( &h->out.bs, h->sps, h->pps );
      if( x264_nal_end( h ) )
          return -1;
  
@@ -1536,7 +1547,7 @@ static void x264_weighted_pred_init( x264_t *h )
  {
      /* for now no analysis and set all weights to nothing */
      for( int i_ref = 0; i_ref < h->i_ref[0]; i_ref++ )
-        h->fenc->weighted[i_ref] = h->fref[0][i_ref]->filtered[0];
+        h->fenc->weighted[i_ref] = h->fref[0][i_ref]->filtered[0][0];
  
      // FIXME: This only supports weighting of one reference frame
      // and duplicates of that frame.
@@ -1580,7 +1591,7 @@ static void x264_weighted_pred_init( x264_t *h )
                          //scale full resolution frame
                          if( h->param.i_threads == 1 )
                          {
-                            pixel *src = h->fref[0][j]->filtered[0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
+                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
                              pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
                              int stride = h->fenc->i_stride[0];
                              int width = h->fenc->i_width[0] + PADH*2;
@@ -1758,8 +1769,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
       * but the actual image data is equivalent. For now, maintain this
       * consistency by copying deblocked pixels between planes. */
      if( PARAM_INTERLACED )
-        for( int p = 0; p < 2; p++ )
-            for( int i = minpix_y>>p; i < maxpix_y>>p; i++ )
+        for( int p = 0; p < h->fdec->i_plane; p++ )
+            for( int i = minpix_y>>(!CHROMA444 && p); i < maxpix_y>>(!CHROMA444 && p); i++ )
                  memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
                          h->fdec->plane[p]     + i*h->fdec->i_stride[p],
                          h->mb.i_mb_width*16*sizeof(pixel) );
@@ -1776,7 +1787,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
      }
  
      if( SLICE_MBAFF )
-        for( int i = 0; i < 2; i++ )
+        for( int i = 0; i < 3; i++ )
          {
              XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
              XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] );
@@ -1790,18 +1801,21 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
          maxpix_y = X264_MIN( maxpix_y, h->param.i_height );
          if( h->param.analyse.b_psnr )
          {
-            uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf,
-                h->fdec->plane[0] + minpix_y * h->fdec->i_stride[0], h->fdec->i_stride[0],
-                h->fenc->plane[0] + minpix_y * h->fenc->i_stride[0], h->fenc->i_stride[0],
-                h->param.i_width, maxpix_y-minpix_y );
-            uint64_t ssd_u, ssd_v;
-            x264_pixel_ssd_nv12( &h->pixf,
-                h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
-                h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
-                h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v );
-            h->stat.frame.i_ssd[0] += ssd_y;
-            h->stat.frame.i_ssd[1] += ssd_u;
-            h->stat.frame.i_ssd[2] += ssd_v;
+            for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
+                h->stat.frame.i_ssd[p] += x264_pixel_ssd_wxh( &h->pixf,
+                    h->fdec->plane[p] + minpix_y * h->fdec->i_stride[p], h->fdec->i_stride[p],
+                    h->fenc->plane[p] + minpix_y * h->fenc->i_stride[p], h->fenc->i_stride[p],
+                    h->param.i_width, maxpix_y-minpix_y );
+            if( !CHROMA444 )
+            {
+                uint64_t ssd_u, ssd_v;
+                x264_pixel_ssd_nv12( &h->pixf,
+                    h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
+                    h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
+                    h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v );
+                h->stat.frame.i_ssd[1] += ssd_u;
+                h->stat.frame.i_ssd[2] += ssd_v;
+            }
          }
  
          if( h->param.analyse.b_ssim )
@@ -1995,7 +2009,7 @@ static int x264_slice_write( x264_t *h )
          bs_align_1( &h->out.bs );
  
          /* init cabac */
-        x264_cabac_context_init( &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
+        x264_cabac_context_init( h, &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
          x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end );
          last_emu_check = h->cabac.p;
      }
@@ -2207,11 +2221,26 @@ reencode:
          {
              if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
              {
-                int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
-                           + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
-                h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
-                h->stat.frame.i_mb_cbp[!b_intra + 2] += !!h->mb.i_cbp_chroma;
-                h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma >> 1;
+                if( CHROMA444 )
+                {
+                    for( int i = 0; i < 4; i++ )
+                        if( h->mb.i_cbp_luma & (1 << i) )
+                            for( int p = 0; p < 3; p++ )
+                            {
+                                int s8 = i*4+p*16;
+                                int nnz8x8 = M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+0] )
+                                           | M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+8] );
+                                h->stat.frame.i_mb_cbp[!b_intra + p*2] += !!nnz8x8;
+                            }
+                }
+                else
+                {
+                    int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
+                               + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
+                    h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
+                    h->stat.frame.i_mb_cbp[!b_intra + 2] += !!h->mb.i_cbp_chroma;
+                    h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma >> 1;
+                }
              }
              if( h->mb.i_cbp_luma && !b_intra )
              {
@@ -2800,7 +2829,7 @@ int     x264_encoder_encode( x264_t *h,
  
              /* generate picture parameters */
              x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-            x264_pps_write( &h->out.bs, h->pps );
+            x264_pps_write( &h->out.bs, h->sps, h->pps );
              if( x264_nal_end( h ) )
                  return -1;
              overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
@@ -3096,17 +3125,19 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
              h->stat.frame.i_ssd[1],
              h->stat.frame.i_ssd[2],
          };
+        int luma_size = h->param.i_width * h->param.i_height;
+        int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
+        double psnr_y = x264_psnr( ssd[0], luma_size );
+        double psnr_u = x264_psnr( ssd[1], chroma_size );
+        double psnr_v = x264_psnr( ssd[2], chroma_size );
  
          h->stat.f_ssd_global[h->sh.i_type]   += dur * (ssd[0] + ssd[1] + ssd[2]);
-        h->stat.f_psnr_average[h->sh.i_type] += dur * x264_psnr( ssd[0] + ssd[1] + ssd[2], 3 * h->param.i_width * h->param.i_height / 2 );
-        h->stat.f_psnr_mean_y[h->sh.i_type]  += dur * x264_psnr( ssd[0], h->param.i_width * h->param.i_height );
-        h->stat.f_psnr_mean_u[h->sh.i_type]  += dur * x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4 );
-        h->stat.f_psnr_mean_v[h->sh.i_type]  += dur * x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4 );
+        h->stat.f_psnr_average[h->sh.i_type] += dur * x264_psnr( ssd[0] + ssd[1] + ssd[2], luma_size + chroma_size*2 );
+        h->stat.f_psnr_mean_y[h->sh.i_type]  += dur * psnr_y;
+        h->stat.f_psnr_mean_u[h->sh.i_type]  += dur * psnr_u;
+        h->stat.f_psnr_mean_v[h->sh.i_type]  += dur * psnr_v;
  
-        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
-                  x264_psnr( ssd[0], h->param.i_width * h->param.i_height ),
-                  x264_psnr( ssd[1], h->param.i_width * h->param.i_height / 4),
-                  x264_psnr( ssd[2], h->param.i_width * h->param.i_height / 4) );
+        snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", psnr_y, psnr_u, psnr_v );
      }
  
      if( h->param.analyse.b_ssim )
@@ -3186,7 +3217,9 @@ static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_p
   ****************************************************************************/
  void    x264_encoder_close  ( x264_t *h )
  {
-    int64_t i_yuv_size = 3 * h->param.i_width * h->param.i_height / 2;
+    int luma_size = h->param.i_width * h->param.i_height;
+    int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
+    int64_t i_yuv_size = luma_size + chroma_size * 2;
      int64_t i_mb_count_size[2][7] = {{0}};
      char buf[200];
      int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM]
@@ -3376,15 +3409,17 @@ void    x264_encoder_close  ( x264_t *h )
          }
  
          buf[0] = 0;
+        int csize = CHROMA444 ? 4 : 1;
          if( i_mb_count != i_all_intra )
              sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
                       h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
-                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)  ),
-                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
-        x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra: %.1f%% %.1f%% %.1f%%%s\n",
+                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)*csize),
+                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)*csize) );
+        x264_log( h, X264_LOG_INFO, "coded y,%s,%s intra: %.1f%% %.1f%% %.1f%%%s\n",
+                  CHROMA444?"u":"uvDC", CHROMA444?"v":"uvAC",
                    h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
-                  h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra  ),
-                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra  ), buf );
+                  h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra*csize),
+                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra*csize), buf );
  
          int64_t fixed_pred_modes[4][9] = {{0}};
          int64_t sum_pred_modes[4] = {0};
@@ -3423,7 +3458,7 @@ void    x264_encoder_close  ( x264_t *h )
              fixed_pred_modes[3][x264_mb_pred_mode8x8c_fix[i]] += h->stat.i_mb_pred_mode[3][i];
              sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i];
          }
-        if( sum_pred_modes[3] )
+        if( sum_pred_modes[3] && !CHROMA444 )
              x264_log( h, X264_LOG_INFO, "i8c dc,h,v,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
                        fixed_pred_modes[3][0] * 100.0 / sum_pred_modes[3],
                        fixed_pred_modes[3][1] * 100.0 / sum_pred_modes[3],
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 86e3d87fac315dddee2b2a6fbfd108ae7eeee065..d9a601736dd5f8440b8d610855dc77ce78af4b5f 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -81,24 +81,24 @@ static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
      dct4x4[3][0] = 0;
  }
  
-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
  {
-    int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
-    if( h->mb.b_noise_reduction && ctx_block_cat != DCT_LUMA_AC )
-        h->quantf.denoise_dct( dct, h->nr_residual_sum[0], h->nr_offset[0], 16 );
+    int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
+    if( h->mb.b_noise_reduction )
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
      if( h->mb.b_trellis )
-        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, 0, idx );
+        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
      else
          return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
  }
  
-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
  {
-    int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
+    int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
      if( h->mb.b_noise_reduction )
-        h->quantf.denoise_dct( dct, h->nr_residual_sum[1], h->nr_offset[1], 64 );
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
      if( h->mb.b_trellis )
-        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
+        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
      else
          return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
  }
@@ -114,92 +114,116 @@ static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, i
  /* This means that decimation can be done merely by adjusting the CBP and NNZ
   * rather than memsetting the coefficients. */
  
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
+void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode )
  {
      int nz;
-    pixel *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
-    pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
+    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
+    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
      ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
  
+    if( h->mb.b_lossless )
+        x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
+    else
+        h->predict_4x4[i_mode]( p_dst );
+
      if( h->mb.b_lossless )
      {
-        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
-        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
+        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
          h->mb.i_cbp_luma |= nz<<(idx>>2);
          return;
      }
  
      h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
  
-    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
-    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
+    h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
      if( nz )
      {
          h->mb.i_cbp_luma |= 1<<(idx>>2);
-        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
-        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
+        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
          h->dctf.add4x4_idct( p_dst, dct4x4 );
      }
  }
  
-#define STORE_8x8_NNZ( s8, nz )\
+#define STORE_8x8_NNZ( p, idx, nz )\
  do\
  {\
-    M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
-    M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
  } while(0)
  
-#define CLEAR_16x16_NNZ \
+#define CLEAR_16x16_NNZ( p ) \
+do\
  {\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
-    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
-}
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
+} while(0)
  
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
+void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge )
  {
      int x = idx&1;
      int y = idx>>1;
-    int s8 = X264_SCAN8_0 + 2*x + 16*y;
      int nz;
-    pixel *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
-    pixel *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
+    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
+    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
      ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+    ALIGNED_ARRAY_16( pixel, edge_buf,[33] );
+
+    if( !edge )
+    {
+        h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
+        edge = edge_buf;
+    }
+
+    if( h->mb.b_lossless )
+        x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
+    else
+        h->predict_8x8[i_mode]( p_dst, edge );
  
      if( h->mb.b_lossless )
      {
-        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
-        STORE_8x8_NNZ( s8, nz );
+        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
+        STORE_8x8_NNZ( p, idx, nz );
          h->mb.i_cbp_luma |= nz<<idx;
          return;
      }
  
      h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
  
-    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
+    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
      if( nz )
      {
          h->mb.i_cbp_luma |= 1<<idx;
-        h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
-        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
+        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
+        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
          h->dctf.add8x8_idct8( p_dst, dct8x8 );
-        STORE_8x8_NNZ( s8, 1 );
+        STORE_8x8_NNZ( p, idx, 1 );
      }
      else
-        STORE_8x8_NNZ( s8, 0 );
+        STORE_8x8_NNZ( p, idx, 0 );
  }
  
-static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
+static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
  {
-    pixel *p_src = h->mb.pic.p_fenc[0];
-    pixel *p_dst = h->mb.pic.p_fdec[0];
+    pixel *p_src = h->mb.pic.p_fenc[p];
+    pixel *p_dst = h->mb.pic.p_fdec[p];
  
      ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
      ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );
  
-    int nz;
+    int nz, block_cbp = 0;
      int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
+    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
+    int i_mode = h->mb.i_intra16x16_pred_mode;
+
+    if( h->mb.b_lossless )
+        x264_predict_lossless_16x16( h, p, i_mode );
+    else
+        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
  
      if( h->mb.b_lossless )
      {
@@ -207,13 +231,13 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
          {
              int oe = block_idx_xy_fenc[i];
              int od = block_idx_xy_fdec[i];
-            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
-            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
-            h->mb.i_cbp_luma |= nz;
+            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
+            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
+            block_cbp |= nz;
          }
-        h->mb.i_cbp_luma *= 0xf;
-        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+        h->mb.i_cbp_luma |= block_cbp * 0xf;
+        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4 );
+        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
          return;
      }
  
@@ -228,14 +252,17 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
          dct4x4[i][0] = 0;
  
          /* quant/scan/dequant */
-        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
-        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+        if( h->mb.b_trellis )
+            nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
+        else
+            nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+        h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
          if( nz )
          {
-            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
-            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
-            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
-            h->mb.i_cbp_luma = 0xf;
+            h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
+            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
+            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
+            block_cbp = 0xf;
          }
      }
  
@@ -243,31 +270,33 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
      /* More useful with CAVLC, but still useful with CABAC. */
      if( decimate_score < 6 )
      {
-        h->mb.i_cbp_luma = 0;
-        CLEAR_16x16_NNZ
+        CLEAR_16x16_NNZ( p );
+        block_cbp = 0;
      }
+    else
+        h->mb.i_cbp_luma |= block_cbp;
  
      h->dctf.dct4x4dc( dct_dc4x4 );
      if( h->mb.b_trellis )
-        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
+        nz = x264_quant_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, 0, LUMA_DC+p );
      else
-        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
+        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
  
-    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
+    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
      if( nz )
      {
-        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
  
          /* output samples to fdec */
          h->dctf.idct4x4dc( dct_dc4x4 );
-        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
-        if( h->mb.i_cbp_luma )
+        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
+        if( block_cbp )
              for( int i = 0; i < 16; i++ )
                  dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
      }
  
      /* put pixels to fdec */
-    if( h->mb.i_cbp_luma )
+    if( block_cbp )
          h->dctf.add16x16_idct( p_dst, dct4x4 );
      else if( nz )
          h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
@@ -308,15 +337,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
              score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
          if( score < thresh*4 )
          {
-            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[25]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
+            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
  
              for( int ch = 0; ch < 2; ch++ )
              {
@@ -324,7 +350,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                  {
                      h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
                      if( h->mb.b_trellis )
-                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
                      else
                          nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
  
@@ -332,7 +358,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                      {
                          if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
                              continue;
-                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                          zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                          idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                          h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
@@ -359,11 +385,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
              {
                  int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                  int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
-                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
-                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
+                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
                  h->mb.i_cbp_chroma |= nz;
              }
-            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
+            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch] );
              continue;
          }
  
@@ -379,36 +405,34 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
                  nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
              else
                  nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
-            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
              if( nz )
              {
                  nz_ac = 1;
-                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
+                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*16], dct4x4[i] );
                  h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                  if( b_decimate )
-                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
+                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*16] );
              }
          }
  
          if( h->mb.b_trellis )
-            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
          else
              nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
  
-        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;
+        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
  
          if( (b_decimate && i_decimate_score < 7) || !nz_ac )
          {
              /* Decimate the block */
-            h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[16+0+16*ch]] ) = 0;
+            M16( &h->mb.cache.non_zero_count[x264_scan8[16+2+16*ch]] ) = 0;
              if( !nz_dc ) /* Whole block is empty */
                  continue;
-            if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp  ) )
+            if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
              {
-                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                  continue;
              }
              /* DC-only */
@@ -429,61 +453,32 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
      }
  
      /* 0 = none, 1 = DC only, 2 = DC+AC */
-    h->mb.i_cbp_chroma = ((!!M16( &h->mb.cache.non_zero_count[x264_scan8[25]] )) | h->mb.i_cbp_chroma) + h->mb.i_cbp_chroma;
+    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
+                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
  }
  
  static void x264_macroblock_encode_skip( x264_t *h )
  {
-    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
-    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
-    for( int i = 16; i < 24; i++ )
-        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
+    if( CHROMA444 )
+    {
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
+    }
      h->mb.i_cbp_luma = 0;
      h->mb.i_cbp_chroma = 0;
      h->mb.cbp[h->mb.i_mb_xy] = 0;
  }
  
-/*****************************************************************************
- * x264_macroblock_encode_pskip:
- *  Encode an already marked skip block
- *****************************************************************************/
-static void x264_macroblock_encode_pskip( x264_t *h )
-{
-    /* don't do pskip motion compensation if it was already done in macroblock_analyse */
-    if( !h->mb.b_skip_mc )
-    {
-        int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
-                              h->mb.mv_min[0], h->mb.mv_max[0] );
-        int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
-                              h->mb.mv_min[1], h->mb.mv_max[1] );
-
-        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
-                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
-                       mvx, mvy, 16, 16, &h->sh.weight[0][0] );
-
-        /* Special case for mv0, which is (of course) very common in P-skip mode. */
-        if( mvx | mvy )
-            h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                             mvx, mvy, 8, 8 );
-        else
-            h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
-
-        if( h->sh.weight[0][1].weightfn )
-            h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
-                                               h->mb.pic.p_fdec[1], FDEC_STRIDE,
-                                               &h->sh.weight[0][1], 8 );
-        if( h->sh.weight[0][2].weightfn )
-            h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                               h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                                               &h->sh.weight[0][2], 8 );
-    }
-
-    x264_macroblock_encode_skip( h );
-}
-
  /*****************************************************************************
   * Intra prediction for predictive lossless mode.
   *****************************************************************************/
@@ -511,10 +506,10 @@ void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
      }
  }
  
-void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
+void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
  {
-    int stride = h->fenc->i_stride[0] << MB_INTERLACED;
-    pixel *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
+    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
+    pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
  
      if( i_mode == I_PRED_4x4_V )
          h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
@@ -524,10 +519,10 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
          h->predict_4x4[i_mode]( p_dst );
  }
  
-void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] )
+void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[33] )
  {
-    int stride = h->fenc->i_stride[0] << MB_INTERLACED;
-    pixel *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
+    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
+    pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
  
      if( i_mode == I_PRED_8x8_V )
          h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
@@ -537,35 +532,40 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pi
          h->predict_8x8[i_mode]( p_dst, edge );
  }
  
-void x264_predict_lossless_16x16( x264_t *h, int i_mode )
+void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
  {
-    int stride = h->fenc->i_stride[0] << MB_INTERLACED;
+    int stride = h->fenc->i_stride[p] << MB_INTERLACED;
      if( i_mode == I_PRED_16x16_V )
-        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
+        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
      else if( i_mode == I_PRED_16x16_H )
-        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
+        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
      else
-        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
+        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
  }
  
  /*****************************************************************************
   * x264_macroblock_encode:
   *****************************************************************************/
-void x264_macroblock_encode( x264_t *h )
+static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
  {
      int i_qp = h->mb.i_qp;
      int b_decimate = h->mb.b_dct_decimate;
      int b_force_no_skip = 0;
      int nz;
      h->mb.i_cbp_luma = 0;
-    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
+    for( int p = 0; p < plane_count; p++ )
+        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
  
      if( h->mb.i_type == I_PCM )
      {
          /* if PCM is chosen, we need to store reconstructed frame data */
-        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
-        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
-        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
+        for( int p = 0; p < plane_count; p++ )
+            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
+        if( chroma )
+        {
+            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
+            h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
+        }
          return;
      }
  
@@ -583,8 +583,41 @@ void x264_macroblock_encode( x264_t *h )
  
      if( h->mb.i_type == P_SKIP )
      {
-        /* A bit special */
-        x264_macroblock_encode_pskip( h );
+        /* don't do pskip motion compensation if it was already done in macroblock_analyse */
+        if( !h->mb.b_skip_mc )
+        {
+            int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
+                                  h->mb.mv_min[0], h->mb.mv_max[0] );
+            int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
+                                  h->mb.mv_min[1], h->mb.mv_max[1] );
+
+            for( int p = 0; p < plane_count; p++ )
+                h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
+                               &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
+                               mvx, mvy, 16, 16, &h->sh.weight[0][p] );
+
+            if( chroma )
+            {
+                /* Special case for mv0, which is (of course) very common in P-skip mode. */
+                if( mvx | mvy )
+                    h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                     h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
+                                     mvx, mvy, 8, 8 );
+                else
+                    h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+
+                if( h->sh.weight[0][1].weightfn )
+                    h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+                                                       h->mb.pic.p_fdec[1], FDEC_STRIDE,
+                                                       &h->sh.weight[0][1], 8 );
+                if( h->sh.weight[0][2].weightfn )
+                    h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                                       h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                                       &h->sh.weight[0][2], 8 );
+            }
+        }
+
+        x264_macroblock_encode_skip( h );
          return;
      }
      if( h->mb.i_type == B_SKIP )
@@ -598,20 +631,16 @@ void x264_macroblock_encode( x264_t *h )
  
      if( h->mb.i_type == I_16x16 )
      {
-        const int i_mode = h->mb.i_intra16x16_pred_mode;
          h->mb.b_transform_8x8 = 0;
  
-        if( h->mb.b_lossless )
-            x264_predict_lossless_16x16( h, i_mode );
-        else
-            h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
-
-        /* encode the 16x16 macroblock */
-        x264_mb_encode_i16x16( h, i_qp );
+        for( int p = 0; p < plane_count; p++ )
+        {
+            x264_mb_encode_i16x16( h, p, i_qp );
+            i_qp = h->mb.i_chroma_qp;
+        }
      }
      else if( h->mb.i_type == I_8x8 )
      {
-        ALIGNED_ARRAY_16( pixel, edge,[33] );
          h->mb.b_transform_8x8 = 1;
          /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
          if( h->mb.i_skip_intra )
@@ -626,18 +655,14 @@ void x264_macroblock_encode( x264_t *h )
              if( h->mb.i_skip_intra == 2 )
                  h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
          }
-        for( int i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
+        for( int p = 0; p < plane_count; p++ )
          {
-            pixel *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
-            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
-            h->predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
-
-            if( h->mb.b_lossless )
-                x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
-            else
-                h->predict_8x8[i_mode]( p_dst, edge );
-
-            x264_mb_encode_i8x8( h, i, i_qp );
+            for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
+            {
+                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
+                x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL );
+            }
+            i_qp = h->mb.i_chroma_qp;
          }
      }
      else if( h->mb.i_type == I_4x4 )
@@ -656,20 +681,20 @@ void x264_macroblock_encode( x264_t *h )
              if( h->mb.i_skip_intra == 2 )
                  h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
          }
-        for( int i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
+        for( int p = 0; p < plane_count; p++ )
          {
-            pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
-            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+            for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
+            {
+                pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
+                int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
  
-            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                /* emulate missing topright samples */
-                MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
+                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
+                    /* emulate missing topright samples */
+                    MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
  
-            if( h->mb.b_lossless )
-                x264_predict_lossless_4x4( h, p_dst, i, i_mode );
-            else
-                h->predict_4x4[i_mode]( p_dst );
-            x264_mb_encode_i4x4( h, i, i_qp );
+                x264_mb_encode_i4x4( h, p, i, i_qp, i_mode );
+            }
+            i_qp = h->mb.i_chroma_qp;
          }
      }
      else    /* Inter MB */
@@ -683,167 +708,181 @@ void x264_macroblock_encode( x264_t *h )
          if( h->mb.b_lossless )
          {
              if( h->mb.b_transform_8x8 )
-                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                {
-                    int x = i8x8&1;
-                    int y = i8x8>>1;
-                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
-
-                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
-                                                                   h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
-                    STORE_8x8_NNZ( s8, nz );
-                    h->mb.i_cbp_luma |= nz << i8x8;
-                }
+                for( int p = 0; p < plane_count; p++ )
+                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
+                    {
+                        int x = i8x8&1;
+                        int y = i8x8>>1;
+                        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
+                                                                           h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
+                        STORE_8x8_NNZ( p, i8x8, nz );
+                        h->mb.i_cbp_luma |= nz << i8x8;
+                    }
              else
-                for( int i4x4 = 0; i4x4 < 16; i4x4++ )
-                {
-                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
-                                        h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
-                                        h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
-                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
-                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
-                }
+                for( int p = 0; p < plane_count; p++ )
+                    for( int i4x4 = 0; i4x4 < 16; i4x4++ )
+                    {
+                        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
+                                                 h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
+                                                 h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
+                        h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
+                        h->mb.i_cbp_luma |= nz << (i4x4>>2);
+                    }
          }
          else if( h->mb.b_transform_8x8 )
          {
              ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
              b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
-            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
-            h->nr_count[1] += h->mb.b_noise_reduction * 4;
  
-            for( int idx = 0; idx < 4; idx++ )
+            for( int p = 0; p < plane_count; p++ )
              {
-                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
+                h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
+                h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
  
-                if( nz )
+                int plane_cbp = 0;
+                for( int idx = 0; idx < 4; idx++ )
                  {
-                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
-                    if( b_decimate )
+                    nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
+
+                    if( nz )
                      {
-                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
-                        i_decimate_mb += i_decimate_8x8;
-                        if( i_decimate_8x8 >= 4 )
-                            h->mb.i_cbp_luma |= 1<<idx;
+                        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
+                        if( b_decimate )
+                        {
+                            int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
+                            i_decimate_mb += i_decimate_8x8;
+                            if( i_decimate_8x8 >= 4 )
+                                plane_cbp |= 1<<idx;
+                        }
+                        else
+                            plane_cbp |= 1<<idx;
                      }
-                    else
-                        h->mb.i_cbp_luma |= 1<<idx;
                  }
-            }
  
-            if( i_decimate_mb < 6 && b_decimate )
-            {
-                h->mb.i_cbp_luma = 0;
-                CLEAR_16x16_NNZ
-            }
-            else
-            {
-                for( int idx = 0; idx < 4; idx++ )
+                if( i_decimate_mb < 6 && b_decimate )
                  {
-                    int x = idx&1;
-                    int y = idx>>1;
-                    int s8 = X264_SCAN8_0 + 2*x + 16*y;
-
-                    if( h->mb.i_cbp_luma&(1<<idx) )
+                    plane_cbp = 0;
+                    CLEAR_16x16_NNZ( p );
+                }
+                else
+                {
+                    for( int idx = 0; idx < 4; idx++ )
                      {
-                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
-                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
-                        STORE_8x8_NNZ( s8, 1 );
+                        int x = idx&1;
+                        int y = idx>>1;
+
+                        if( plane_cbp&(1<<idx) )
+                        {
+                            h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
+                            h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
+                            STORE_8x8_NNZ( p, idx, 1 );
+                        }
+                        else
+                            STORE_8x8_NNZ( p, idx, 0 );
                      }
-                    else
-                        STORE_8x8_NNZ( s8, 0 );
                  }
+                h->mb.i_cbp_luma |= plane_cbp;
+                i_qp = h->mb.i_chroma_qp;
              }
          }
          else
          {
              ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
-            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
-            h->nr_count[0] += h->mb.b_noise_reduction * 16;
-
-            for( int i8x8 = 0; i8x8 < 4; i8x8++ )
+            for( int p = 0; p < plane_count; p++ )
              {
-                int i_decimate_8x8 = 0;
-                int cbp = 0;
+                h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
+                h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16;
  
-                /* encode one 4x4 block */
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                int plane_cbp = 0;
+                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                  {
-                    int idx = i8x8 * 4 + i4x4;
+                    int i_decimate_8x8 = 0;
+                    int cbp = 0;
  
-                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
-                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+                    /* encode one 4x4 block */
+                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                    {
+                        int idx = i8x8 * 4 + i4x4;
+
+                        nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx );
+                        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
+
+                        if( nz )
+                        {
+                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
+                            h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
+                            if( b_decimate && i_decimate_8x8 < 6 )
+                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
+                            cbp = 1;
+                        }
+                    }
  
-                    if( nz )
+                    int x = i8x8&1;
+                    int y = i8x8>>1;
+
+                    /* decimate this 8x8 block */
+                    i_decimate_mb += i_decimate_8x8;
+                    if( b_decimate )
+                    {
+                        if( i_decimate_8x8 < 4 )
+                            STORE_8x8_NNZ( p, i8x8, 0 );
+                        else
+                            plane_cbp |= 1<<i8x8;
+                    }
+                    else if( cbp )
                      {
-                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
-                        if( b_decimate && i_decimate_8x8 < 6 )
-                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
-                        cbp = 1;
+                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                        plane_cbp |= 1<<i8x8;
                      }
                  }
  
-                int x = i8x8&1;
-                int y = i8x8>>1;
-
-                /* decimate this 8x8 block */
-                i_decimate_mb += i_decimate_8x8;
                  if( b_decimate )
                  {
-                    if( i_decimate_8x8 < 4 )
+                    if( i_decimate_mb < 6 )
                      {
-                        int s8 = X264_SCAN8_0 + 2*x + 16*y;
-                        STORE_8x8_NNZ( s8, 0 );
+                        plane_cbp = 0;
+                        CLEAR_16x16_NNZ( p );
                      }
                      else
-                        h->mb.i_cbp_luma |= 1<<i8x8;
-                }
-                else if( cbp )
-                {
-                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
-                    h->mb.i_cbp_luma |= 1<<i8x8;
-                }
-            }
-
-            if( b_decimate )
-            {
-                if( i_decimate_mb < 6 )
-                {
-                    h->mb.i_cbp_luma = 0;
-                    CLEAR_16x16_NNZ
-                }
-                else
-                {
-                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                        if( h->mb.i_cbp_luma&(1<<i8x8) )
-                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                    {
+                        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
+                            if( plane_cbp&(1<<i8x8) )
+                                h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                    }
                  }
+                h->mb.i_cbp_luma |= plane_cbp;
+                i_qp = h->mb.i_chroma_qp;
              }
          }
      }
  
      /* encode chroma */
-    if( IS_INTRA( h->mb.i_type ) )
+    if( chroma )
      {
-        const int i_mode = h->mb.i_chroma_pred_mode;
-        if( h->mb.b_lossless )
-            x264_predict_lossless_8x8_chroma( h, i_mode );
-        else
+        if( IS_INTRA( h->mb.i_type ) )
          {
-            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
-            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+            const int i_mode = h->mb.i_chroma_pred_mode;
+            if( h->mb.b_lossless )
+                x264_predict_lossless_8x8_chroma( h, i_mode );
+            else
+            {
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+            }
          }
-    }
  
-    /* encode the 8x8 blocks */
-    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
+        /* encode the 8x8 blocks */
+        x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
+    }
+    else
+        h->mb.i_cbp_chroma = 0;
  
      /* store cbp */
      int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
      if( h->param.b_cabac )
-        cbp |= h->mb.cache.non_zero_count[x264_scan8[24]] << 8
-            |  h->mb.cache.non_zero_count[x264_scan8[25]] << 9
-            |  h->mb.cache.non_zero_count[x264_scan8[26]] << 10;
+        cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
+            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
+            |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
      h->mb.cbp[h->mb.i_mb_xy] = cbp;
  
      /* Check for P_SKIP
@@ -867,11 +906,19 @@ void x264_macroblock_encode( x264_t *h )
      }
  }
  
+void x264_macroblock_encode( x264_t *h ) /* entry point: forwards to x264_macroblock_encode_internal with literal layout arguments chosen by CHROMA444 */
+{
+    if( CHROMA444 )
+        x264_macroblock_encode_internal( h, 3, 0 ); /* presumably plane_count=3, chroma=0 -- helper signature is outside this hunk, confirm */
+    else
+        x264_macroblock_encode_internal( h, 1, 1 ); /* presumably plane_count=1, chroma=1 -- confirm against the helper's definition */
+}
+
  /*****************************************************************************
   * x264_macroblock_probe_skip:
   *  Check if the current MB could be encoded as a [PB]_SKIP
   *****************************************************************************/
-int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
+static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
  {
      ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
      ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
@@ -881,106 +928,114 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
      int i_qp = h->mb.i_qp;
      int thresh, ssd;
  
-    if( !b_bidir )
+    for( int p = 0; p < plane_count; p++ )
      {
-        /* Get the MV */
-        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
-        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
-
-        /* Motion compensation */
-        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
-                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
-                       mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
-    }
-
-    for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
-    {
-        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
-        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
-        /* get luma diff */
-        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
-                                    h->mb.pic.p_fdec[0] + fdec_offset );
-        /* encode one 4x4 block */
-        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+        int quant_cat = p ? CQM_4PC : CQM_4PY;
+        if( !b_bidir )
          {
-            if( h->mb.b_noise_reduction )
-                h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0], h->nr_offset[0], 16 );
-            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
-                continue;
-            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
-            i_decimate_mb += h->quantf.decimate_score16( dctscan );
-            if( i_decimate_mb >= 6 )
-                return 0;
+            /* Get the MV */
+            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
+            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
+
+            /* Motion compensation */
+            h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
+                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
+                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
          }
-    }
-
-    /* encode chroma */
-    i_qp = h->mb.i_chroma_qp;
-    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
  
-    if( !b_bidir )
-    {
-        /* Special case for mv0, which is (of course) very common in P-skip mode. */
-        if( M32( mvp ) )
-            h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
-                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                             mvp[0], mvp[1], 8, 8 );
-        else
-            h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
+        {
+            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
+            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
+            /* get luma diff */
+            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
+                                        h->mb.pic.p_fdec[p] + fdec_offset );
+            /* encode one 4x4 block */
+            for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+            {
+                if( h->mb.b_noise_reduction )
+                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
+                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) )
+                    continue;
+                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
+                i_decimate_mb += h->quantf.decimate_score16( dctscan );
+                if( i_decimate_mb >= 6 )
+                    return 0;
+            }
+        }
+        i_qp = h->mb.i_chroma_qp;
      }
  
-    for( int ch = 0; ch < 2; ch++ )
+    if( chroma )
      {
-        pixel *p_src = h->mb.pic.p_fenc[1+ch];
-        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
-
-        if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
-            h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                  h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                  &h->sh.weight[0][1+ch], 8 );
+        /* encode chroma */
+        i_qp = h->mb.i_chroma_qp;
+        thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
  
-        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
-        /* so instead we check SSD and skip the actual check if the score is low enough. */
-        ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
-        if( ssd < thresh )
-            continue;
+        if( !b_bidir )
+        {
+            /* Special case for mv0, which is (of course) very common in P-skip mode. */
+            if( M32( mvp ) )
+                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
+                                 mvp[0], mvp[1], 8, 8 );
+            else
+                h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+        }
  
-        /* The vast majority of chroma checks will terminate during the DC check or the higher
-         * threshold check, so we can save time by doing a DC-only DCT. */
-        if( h->mb.b_noise_reduction )
+        for( int ch = 0; ch < 2; ch++ )
          {
-            h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
-            for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+            pixel *p_src = h->mb.pic.p_fenc[1+ch];
+            pixel *p_dst = h->mb.pic.p_fdec[1+ch];
+
+            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
+                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+                                                      &h->sh.weight[0][1+ch], 8 );
+
+            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
+            /* so instead we check SSD and skip the actual check if the score is low enough. */
+            ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            if( ssd < thresh )
+                continue;
+
+            /* The vast majority of chroma checks will terminate during the DC check or the higher
+             * threshold check, so we can save time by doing a DC-only DCT. */
+            if( h->mb.b_noise_reduction )
              {
-                h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                dct2x2[i4x4] = dct4x4[i4x4][0];
+                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
+                    dct2x2[i4x4] = dct4x4[i4x4][0];
+                }
              }
-        }
-        else
-            h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
+            else
+                h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
  
-        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
-            return 0;
+            if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
+                return 0;
  
-        /* If there wasn't a termination in DC, we can check against a much higher threshold. */
-        if( ssd < thresh*4 )
-            continue;
+            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
+            if( ssd < thresh*4 )
+                continue;
  
-        if( !h->mb.b_noise_reduction )
-            h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+            if( !h->mb.b_noise_reduction )
+                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
  
-        /* calculate dct coeffs */
-        for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
-        {
-            dct4x4[i4x4][0] = 0;
-            if( h->mb.b_noise_reduction )
-                h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
-                continue;
-            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
-            i_decimate_mb += h->quantf.decimate_score15( dctscan );
-            if( i_decimate_mb >= 7 )
-                return 0;
+            /* calculate dct coeffs */
+            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
+            {
+                dct4x4[i4x4][0] = 0;
+                if( h->mb.b_noise_reduction )
+                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
+                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
+                    continue;
+                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
+                i_decimate_mb += h->quantf.decimate_score15( dctscan );
+                if( i_decimate_mb >= 7 )
+                    return 0;
+            }
          }
      }
  
@@ -988,6 +1043,14 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
      return 1;
  }
  
+int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) /* returns x264_macroblock_probe_skip_internal's result unchanged; only the plane layout arguments differ per branch */
+{
+    if( CHROMA444 )
+        return x264_macroblock_probe_skip_internal( h, b_bidir, 3, 0 ); /* plane_count=3, chroma=0: three planes go through the per-plane loop, the separate chroma check is not run */
+    else
+        return x264_macroblock_probe_skip_internal( h, b_bidir, 1, 1 ); /* plane_count=1, chroma=1: one plane in the per-plane loop, then the separate 8x8 chroma check */
+}
+
  /****************************************************************************
   * DCT-domain noise reduction / adaptive deadzone
   * from libavcodec
@@ -998,9 +1061,9 @@ void x264_noise_reduction_update( x264_t *h )
      h->nr_offset = h->nr_offset_denoise;
      h->nr_residual_sum = h->nr_residual_sum_buf[0];
      h->nr_count = h->nr_count_buf[0];
-    for( int cat = 0; cat < 3; cat++ )
+    for( int cat = 0; cat < 3 + CHROMA444; cat++ )
      {
-        int dct8x8 = cat == 1;
+        int dct8x8 = cat&1;
          int size = dct8x8 ? 64 : 16;
          const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
  
@@ -1026,161 +1089,210 @@ void x264_noise_reduction_update( x264_t *h )
   * RD only; 4 calls to this do not make up for one macroblock_encode.
   * doesn't transform chroma dc.
   *****************************************************************************/
-void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
+static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
  {
+    int b_decimate = h->mb.b_dct_decimate;
      int i_qp = h->mb.i_qp;
      int x = i8&1;
      int y = i8>>1;
-    int s8 = X264_SCAN8_0 + 2*x + 16*y;
-    pixel *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
-    pixel *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
-    int b_decimate = h->mb.b_dct_decimate;
-    int nnz8x8 = 0;
      int nz;
  
+    h->mb.i_cbp_chroma = 0;
+    h->mb.i_cbp_luma &= ~(1 << i8);
+
      if( !h->mb.b_skip_mc )
          x264_mb_mc_8x8( h, i8 );
  
      if( h->mb.b_lossless )
      {
-        if( h->mb.b_transform_8x8 )
+        for( int p = 0; p < plane_count; p++ )
          {
-            nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
-            STORE_8x8_NNZ( s8, nnz8x8 );
-        }
-        else
-        {
-            for( int i4 = i8*4; i4 < i8*4+4; i4++ )
+            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
+            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
+            int nnz8x8 = 0;
+            if( h->mb.b_transform_8x8 )
              {
-                nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
-                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
-                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
-                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
-                nnz8x8 |= nz;
+                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
+                STORE_8x8_NNZ( p, i8, nnz8x8 );
              }
+            else
+            {
+                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
+                {
+                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
+                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
+                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
+                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
+                    nnz8x8 |= nz;
+                }
+            }
+            h->mb.i_cbp_luma |= nnz8x8 << i8;
          }
-        for( int ch = 0; ch < 2; ch++ )
+        if( chroma )
          {
-            dctcoef dc;
-            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
-            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
-            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
-            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
+            for( int ch = 0; ch < 2; ch++ )
+            {
+                dctcoef dc;
+                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
+                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
+            }
+            h->mb.i_cbp_chroma = 0x02;
          }
      }
      else
      {
          if( h->mb.b_transform_8x8 )
          {
-            ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
-            h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-            nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
-            if( nnz8x8 )
+            for( int p = 0; p < plane_count; p++ )
              {
-                h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
-
-                if( b_decimate && !h->mb.b_trellis )
-                    nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
-
+                int quant_cat = p ? CQM_8PC : CQM_8PY;
+                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
+                ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
+                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                  if( nnz8x8 )
                  {
-                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
-                    h->dctf.add8x8_idct8( p_fdec, dct8x8 );
-                    STORE_8x8_NNZ( s8, 1 );
+                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );
+
+                    if( b_decimate && !h->mb.b_trellis )
+                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );
+
+                    if( nnz8x8 )
+                    {
+                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
+                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+                        STORE_8x8_NNZ( p, i8, 1 );
+                    }
+                    else
+                        STORE_8x8_NNZ( p, i8, 0 );
                  }
                  else
-                    STORE_8x8_NNZ( s8, 0 );
+                    STORE_8x8_NNZ( p, i8, 0 );
+                h->mb.i_cbp_luma |= nnz8x8 << i8;
+                i_qp = h->mb.i_chroma_qp;
              }
-            else
-                STORE_8x8_NNZ( s8, 0 );
          }
          else
          {
-            int i_decimate_8x8 = 0;
-            ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
-            h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-            for( int i4 = 0; i4 < 4; i4++ )
+            for( int p = 0; p < plane_count; p++ )
              {
-                nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
-                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
-                if( nz )
+                int quant_cat = p ? CQM_4PC : CQM_4PY;
+                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
+                int i_decimate_8x8 = 0, nnz8x8 = 0;
+                ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
+                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
+                for( int i4 = 0; i4 < 4; i4++ )
                  {
-                    h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
-                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
-                    if( b_decimate )
-                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
-                    nnz8x8 = 1;
+                    nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 );
+                    h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz;
+                    if( nz )
+                    {
+                        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] );
+                        h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp );
+                        if( b_decimate )
+                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] );
+                        nnz8x8 = 1;
+                    }
                  }
-            }
  
-            if( b_decimate && i_decimate_8x8 < 4 )
-                nnz8x8 = 0;
+                if( b_decimate && i_decimate_8x8 < 4 )
+                    nnz8x8 = 0;
  
-            if( nnz8x8 )
-                h->dctf.add8x8_idct( p_fdec, dct4x4 );
-            else
-                STORE_8x8_NNZ( s8, 0 );
-        }
+                if( nnz8x8 )
+                    h->dctf.add8x8_idct( p_fdec, dct4x4 );
+                else
+                    STORE_8x8_NNZ( p, i8, 0 );
  
-        i_qp = h->mb.i_chroma_qp;
+                h->mb.i_cbp_luma |= nnz8x8 << i8;
+                i_qp = h->mb.i_chroma_qp;
+            }
+        }
  
-        for( int ch = 0; ch < 2; ch++ )
+        if( chroma )
          {
-            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
-            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
-            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
-            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-            if( h->mb.b_noise_reduction )
-                h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 );
-            dct4x4[0] = 0;
-
-            if( h->mb.b_trellis )
-                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
-            else
-                nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-
-            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
-            if( nz )
+            i_qp = h->mb.i_chroma_qp;
+            for( int ch = 0; ch < 2; ch++ )
              {
-                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
-                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
-                h->dctf.add4x4_idct( p_fdec, dct4x4 );
+                ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
+                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
+                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
+                h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
+                if( h->mb.b_noise_reduction )
+                    h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 );
+                dct4x4[0] = 0;
+
+                if( h->mb.b_trellis )
+                    nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
+                else
+                    nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz;
+                if( nz )
+                {
+                    h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*16], dct4x4 );
+                    h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
+                    h->dctf.add4x4_idct( p_fdec, dct4x4 );
+                }
              }
+            h->mb.i_cbp_chroma = 0x02;
          }
      }
-    h->mb.i_cbp_luma &= ~(1 << i8);
-    h->mb.i_cbp_luma |= nnz8x8 << i8;
-    h->mb.i_cbp_chroma = 0x02;
+}
+
+void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
+{
+    if( CHROMA444 )
+        x264_macroblock_encode_p8x8_internal( h, i8, 3, 0 );
+    else
+        x264_macroblock_encode_p8x8_internal( h, i8, 1, 1 );
  }
  
  /*****************************************************************************
- * RD only, luma only
+ * RD only, luma only (for 4:2:0)
   *****************************************************************************/
-void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
+static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
  {
      int i_qp = h->mb.i_qp;
-    pixel *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
-    pixel *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
-    int nz;
  
-    /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
-
-    if( h->mb.b_lossless )
+    for( int p = 0; p < plane_count; p++ )
      {
-        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
-        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
-    }
-    else
-    {
-        ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
-        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-        nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
-        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
-        if( nz )
+        int quant_cat = p ? CQM_4PC : CQM_4PY;
+        pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
+        pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
+        int nz;
+
+        /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
+
+        if( h->mb.b_lossless )
          {
-            h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
-            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
-            h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
+            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
          }
+        else
+        {
+            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
+            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
+            nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
+            h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
+            if( nz )
+            {
+                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
+                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
+                h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            }
+        }
+        i_qp = h->mb.i_chroma_qp;
      }
  }
+
+/* Public entry point: i4 is forwarded unchanged to the shared implementation, which processes
+ * all three planes when CHROMA444 is set and only plane 0 otherwise. */
+void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
+{
+    if( CHROMA444 )
+        x264_macroblock_encode_p4x4_internal( h, i4, 3 );
+    else
+        x264_macroblock_encode_p4x4_internal( h, i4, 1 );
+}
diff --git a/encoder/macroblock.h b/encoder/macroblock.h

index 4b9091cd5a8abeaad93c2996d9c297a39caebeca..3da0e0d2d42ec333cb15b6bd5cd5ad5bacc5bff2 100644 (file)
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -42,9 +42,9 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
      x264_macroblock_probe_skip( h, 1 )
  
  void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
-void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode );
-void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] );
-void x264_predict_lossless_16x16( x264_t *h, int i_mode );
+void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
+void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[33] );
+void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
  
  void x264_macroblock_encode      ( x264_t *h );
  void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
@@ -52,18 +52,18 @@ void x264_macroblock_write_cavlc ( x264_t *h );
  
  void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
  void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
-void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp );
-void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp );
+void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode );
+void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge );
  void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
  
  void x264_cabac_mb_skip( x264_t *h, int b_skip );
  
  int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma );
+                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
  int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                               int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
  int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                             int i_qp, int b_intra, int idx );
+                             int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
  
  void x264_noise_reduction_update( x264_t *h );
  
diff --git a/encoder/me.c b/encoder/me.c

index 10e6cd823c8b09755616777510227836d3291c20..230a555b8f510dfe221bfff29ff272552f3caf25 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -785,22 +785,37 @@ void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh
  if( b_refine_qpel || (dir^1) != odir ) \
  { \
      int stride = 16; \
-    pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+    pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
      int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
               + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
      if( b_chroma_me && cost < bcost ) \
      { \
-        h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
-        if( m->weight[1].weightfn ) \
-            m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
-                                                                  &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
-        cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
-        if( cost < bcost ) \
+        if( CHROMA444 ) \
          { \
-            if( m->weight[2].weightfn ) \
-                m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
-                                                                      &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
-            cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+            stride = 16; \
+            src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+            cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \
+            if( cost < bcost ) \
+            { \
+                stride = 16; \
+                src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+                cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \
+            } \
+        } \
+        else \
+        { \
+            h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+            if( m->weight[1].weightfn ) \
+                m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
+                                                                      &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
+            cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
+            if( cost < bcost ) \
+            { \
+                if( m->weight[2].weightfn ) \
+                    m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
+                                                                          &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
+                cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+            } \
          } \
      } \
      COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
@@ -813,7 +828,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
      const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
      const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
      const int i_pixel = m->i_pixel;
-    const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
+    const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
      const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
  
      ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
@@ -920,10 +935,24 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
      int i = 4 + 3*dx + dy;\
      int mvx = bm##list##x+dx;\
      int mvy = bm##list##y+dy;\
-    stride[list][i] = bw;\
-    src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
+    stride[0][list][i] = bw;\
+    src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\
+                                     m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
      if( rd )\
-        h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+    {\
+        if( CHROMA444 )\
+        {\
+            stride[1][list][i] = bw;\
+            src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\
+                                             m->i_stride[1], mvx, mvy, bw, bh, weight_none );\
+            stride[2][list][i] = bw;\
+            src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\
+                                             m->i_stride[2], mvx, mvy, bw, bh, weight_none );\
+        }\
+        else\
+            h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
+                             mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+    }\
  }
  
  #define SATD_THRESH 17/16
@@ -943,17 +972,18 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
      const int bw = x264_pixel_size[i_pixel].w;
      const int bh = x264_pixel_size[i_pixel].h;
      ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][8*8] );
-    ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][8*8] );
-    pixel *src[2][9];
+    ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
+    pixel *src[3][2][9];
+    int chromasize = CHROMA444 ? 8 : 4;
      pixel *pix  = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
-    pixel *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
-    pixel *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
+    pixel *pixu = &h->mb.pic.p_fdec[1][chromasize*x + chromasize*y*FDEC_STRIDE];
+    pixel *pixv = &h->mb.pic.p_fdec[2][chromasize*x + chromasize*y*FDEC_STRIDE];
      int ref0 = h->mb.cache.ref[0][s8];
      int ref1 = h->mb.cache.ref[1][s8];
      const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
      const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    int stride[2][9];
+    int stride[3][2][9];
      int bm0x = m0->mv[0];
      int bm0y = m0->mv[1];
      int bm1x = m1->mv[0];
@@ -1023,7 +1053,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
                  int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1];
                  int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3];
                  visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
-                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][i0], stride[0][i0], src[1][i1], stride[1][i1], i_weight );
+                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight );
                  int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
                           + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
                  if( rd )
@@ -1033,8 +1063,16 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
                          bcost = X264_MIN( cost, bcost );
                          M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
                          M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
-                        h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
-                        h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+                        if( CHROMA444 )
+                        {
+                            h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight );
+                            h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight );
+                        }
+                        else
+                        {
+                            h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+                            h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+                        }
                          uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
                          COPY2_IF_LT( bcostrd, costrd, bestj, j );
                      }
@@ -1107,7 +1145,12 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
      { \
          uint64_t cost; \
          M32( cache_mv ) = pack16to32_mask(mx,my); \
-        if( m->i_pixel <= PIXEL_8x8 ) \
+        if( CHROMA444 ) \
+        { \
+            h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+            h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+        } \
+        else if( m->i_pixel <= PIXEL_8x8 ) \
          { \
              h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
              if( m->weight[1].weightfn ) \
@@ -1141,8 +1184,17 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
      uint16_t amvd;
  
      pixel *pix  = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
-    pixel *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
-    pixel *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    pixel *pixu, *pixv;
+    if( CHROMA444 )
+    {
+        pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]];
+        pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]];
+    }
+    else
+    {
+        pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+        pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    }
  
      h->mb.b_skip_mc = 1;
  
diff --git a/encoder/me.h b/encoder/me.h

index 32816304f4d352d1b3c11dabc181241ba6d3ecf5..8b5baa01373fec649f4c2456257fc90f2e5aa24f 100644 (file)
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -41,11 +41,11 @@ typedef struct
      int      i_ref;
      const x264_weight_t *weight;
  
-    pixel *p_fref[6];
+    pixel *p_fref[12];
      pixel *p_fref_w;
      pixel *p_fenc[3];
      uint16_t *integral;
-    int      i_stride[2];
+    int      i_stride[3];
  
      ALIGNED_4( int16_t mvp[2] );
  
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index 9be06452d937e149440a144ab8653c083d5c9154..0c3085d5ddd9d418ce5c8af93f397ba90d0c22f1 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -217,15 +217,15 @@ static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_f
      return ssd - ((uint64_t)sum * sum >> shift);
  }
  
-static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int field, int b_store )
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
  {
-    int w = i ? 8 : 16;
+    int w = b_chroma ? 8 : 16;
      int stride = frame->i_stride[i];
-    int offset = field
+    int offset = b_field
          ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
          : 16 * mb_x + w * mb_y * stride;
-    stride <<= field;
-    if( i )
+    stride <<= b_field;
+    if( b_chroma )
      {
          ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
          h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
@@ -233,7 +233,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
               + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store );
      }
      else
-        return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, 0, b_store );
+        return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store );
  }
  
  // Find the total AC energy of the block in all planes.
@@ -249,16 +249,32 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
          /* We don't know the super-MB mode we're going to pick yet, so
           * simply try both and pick the lower of the two. */
          uint32_t var_interlaced, var_progressive;
-        var_interlaced   = ac_energy_plane( h, mb_x, mb_y, frame, 0, 1, 1 );
-        var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1 );
-        var_progressive  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0 );
-        var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0 );
+        var_interlaced   = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 1, 1 );
+        var_progressive  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0, 0 );
+        if( CHROMA444 )
+        {
+            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 1, 1 );
+            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0, 0 );
+            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 1, 1 );
+            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 0, 0 );
+        }
+        else
+        {
+            var_interlaced  += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1, 1 );
+            var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 0, 0 );
+        }
          var = X264_MIN( var_interlaced, var_progressive );
      }
      else
      {
-        var  = ac_energy_plane( h, mb_x, mb_y, frame, 0, PARAM_INTERLACED, 1 );
-        var += ac_energy_plane( h, mb_x, mb_y, frame, 1, PARAM_INTERLACED, 1 );
+        var  = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, PARAM_INTERLACED, 1 );
+        if( CHROMA444 )
+        {
+            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, PARAM_INTERLACED, 1 );
+            var += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, PARAM_INTERLACED, 1 );
+        }
+        else
+            var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, PARAM_INTERLACED, 1 );
      }
      x264_emms();
      return var;
@@ -363,8 +379,9 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
      {
          uint64_t ssd = frame->i_pixel_ssd[i];
          uint64_t sum = frame->i_pixel_sum[i];
-        int width = h->mb.i_mb_width*16>>!!i;
-        int height = h->mb.i_mb_height*16>>!!i;
+        int size = CHROMA444 || !i ? 16 : 8;
+        int width = h->mb.i_mb_width*size;
+        int height = h->mb.i_mb_height*size;
          frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
      }
  }
@@ -1234,8 +1251,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
          if( h->param.b_bluray_compat )
              mincr = 4;
  
-        /* High 10 doesn't require minCR, so just set the maximum to a large value. */
-        if( h->sps->i_profile_idc == PROFILE_HIGH10 )
+        /* High 10 / High 4:4:4 Predictive doesn't require minCR, so just set the maximum to a large value. */
+        if( h->sps->i_profile_idc >= PROFILE_HIGH10 )
              rc->frame_size_maximum = 1e9;
          else
          {
diff --git a/encoder/rdo.c b/encoder/rdo.c

index 6e7becb8d167f74c62befc1d979527425fbad8ba..7ceb2d60275dc96f2180cd3eb6f3283089db6531 100644 (file)
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -64,7 +64,7 @@ static uint16_t cabac_size_5ones[128];
  #include "cabac.c"
  
  #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
-        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
+        sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
  #define COPY_CABAC_PART( pos, size )\
          memcpy( &cb->state[pos], &h->cabac.state[pos], size )
  
@@ -146,9 +146,10 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
  
  static inline int ssd_mb( x264_t *h )
  {
-    int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
-    chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
+    int chroma_size = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
+    int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0);
+    chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd;
  }
  
  static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -188,29 +189,6 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
      return i_ssd + i_bits;
  }
  
-/* For small partitions (i.e. those using at most one DCT category's worth of CABAC states),
- * it's faster to copy the individual parts than to perform a whole CABAC_COPY. */
-static ALWAYS_INLINE void x264_copy_cabac_part( x264_t *h, x264_cabac_t *cb, int cat, int intra )
-{
-    if( intra )
-        COPY_CABAC_PART( 68, 2 );  //intra pred mode
-    else
-        COPY_CABAC_PART( 40, 16 ); //mvd, rounded up to 16 bytes
-
-    /* 8x8dct writes CBP, while non-8x8dct writes CBF */
-    if( cat != DCT_LUMA_8x8 )
-        COPY_CABAC_PART( 85 + cat * 4, 4 );
-    else
-        COPY_CABAC_PART( 73, 4 );
-
-    /* Really should be 15 bytes, but rounding up a byte saves some
-     * instructions and is faster, and copying extra data doesn't hurt. */
-    COPY_CABAC_PART( significant_coeff_flag_offset[MB_INTERLACED][cat], 16 );
-    COPY_CABAC_PART( last_coeff_flag_offset[MB_INTERLACED][cat], 16 );
-    COPY_CABAC_PART( coeff_abs_level_m1_offset[cat], 10 );
-    cb->f8_bits_encoded = 0;
-}
-
  /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
  
  static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
@@ -224,11 +202,18 @@ static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pi
          x264_macroblock_encode_p4x4( h, i4+2 );
  
      i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
+    if( CHROMA444 )
+    {
+        int chromassd = ssd_plane( h, i_pixel, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
+                      + ssd_plane( h, i_pixel, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
+        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+        i_ssd += chromassd;
+    }
  
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_4x4, 0 );
+        COPY_CABAC;
          x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
@@ -261,10 +246,19 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
      if( i_pixel == PIXEL_8x16 )
          x264_macroblock_encode_p8x8( h, i8+2 );
  
-    chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
-              + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+    i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 );
+    if( CHROMA444 )
+    {
+        chromassd = ssd_plane( h, i_pixel, 1, (i8&1)*8, (i8>>1)*8 )
+                  + ssd_plane( h, i_pixel, 2, (i8&1)*8, (i8>>1)*8 );
+    }
+    else
+    {
+        chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
+                  + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+    }
      chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
-    i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 ) + chromassd;
+    i_ssd += chromassd;
  
      if( h->param.b_cabac )
      {
@@ -279,19 +273,33 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
      return (i_ssd<<8) + i_bits;
  }
  
-static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
+static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[3][48] )
  {
      uint64_t i_ssd, i_bits;
+    int plane_count = CHROMA444 ? 3 : 1;
+    int i_qp = h->mb.i_qp;
      h->mb.i_cbp_luma &= ~(1<<i8);
      h->mb.b_transform_8x8 = 1;
  
-    x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
+    for( int p = 0; p < plane_count; p++ )
+    {
+        x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p] );
+        i_qp = h->mb.i_chroma_qp;
+    }
+
      i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
+    if( CHROMA444 )
+    {
+        int chromassd = ssd_plane( h, PIXEL_8x8, 1, (i8&1)*8, (i8>>1)*8 )
+                      + ssd_plane( h, PIXEL_8x8, 2, (i8&1)*8, (i8>>1)*8 );
+        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+        i_ssd += chromassd;
+    }
  
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_8x8, 1 );
+        COPY_CABAC;
          x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
@@ -304,14 +312,28 @@ static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode
  static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
  {
      uint64_t i_ssd, i_bits;
+    int plane_count = CHROMA444 ? 3 : 1;
+    int i_qp = h->mb.i_qp;
+
+    for( int p = 0; p < plane_count; p++ )
+    {
+        x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode );
+        i_qp = h->mb.i_chroma_qp;
+    }
  
-    x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
      i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
+    if( CHROMA444 )
+    {
+        int chromassd = ssd_plane( h, PIXEL_4x4, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 )
+                      + ssd_plane( h, PIXEL_4x4, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 );
+        chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+        i_ssd += chromassd;
+    }
  
      if( h->param.b_cabac )
      {
          x264_cabac_t cabac_tmp;
-        x264_copy_cabac_part( h, &cabac_tmp, DCT_LUMA_4x4, 1 );
+        COPY_CABAC;
          x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
          i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
      }
@@ -419,7 +441,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                           const udctcoef *quant_mf, const int *unquant_mf,
                           const int *coef_weight, const uint8_t *zigzag,
                           int ctx_block_cat, int i_lambda2, int b_ac,
-                         int dc, int i_coefs, int idx )
+                         int b_chroma, int dc, int i_coefs, int idx )
  {
      int abs_coefs[64], signs[64];
      trellis_node_t nodes[2][8];
@@ -458,6 +480,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
      }
  
      i_last_nnz = i;
+    idx &= i_coefs == 64 ? 3 : 15;
  
      for( ; i >= b_ac; i-- )
      {
@@ -546,7 +569,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
              int d = i_coef - unquant_abs_level;
              int64_t ssd;
              /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
-            if( h->mb.i_psy_trellis && i && !dc && ctx_block_cat != DCT_CHROMA_AC )
+            if( h->mb.i_psy_trellis && i && !dc && !b_chroma )
              {
                  int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
                  int predicted_coef = orig_coef - i_coef * signs[i];
@@ -664,7 +687,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
                           const udctcoef *quant_mf, const int *unquant_mf,
                           const int *coef_weight, const uint8_t *zigzag,
                           int ctx_block_cat, int i_lambda2, int b_ac,
-                         int dc, int i_coefs, int idx, int b_8x8 )
+                         int b_chroma, int dc, int i_coefs, int idx, int b_8x8 )
  {
      ALIGNED_16( dctcoef quant_coefs[2][16] );
      ALIGNED_16( dctcoef coefs[16] ) = {0};
@@ -672,7 +695,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
      int64_t score = 1ULL<<62;
      int i, j;
      const int f = 1<<15;
-    int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? 0 : idx )];
+    int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];
  
      /* Code for handling 8x8dct -> 4x4dct CAVLC munging.  Input/output use a different
       * step/start/end than internal processing. */
@@ -685,6 +708,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
          end = 60 + start;
          step = 4;
      }
+    idx &= 15;
  
      i_lambda2 <<= LAMBDA_BITS;
  
@@ -726,7 +750,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
              delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]);
  
              /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
-            if( h->mb.i_psy_trellis && j && !dc && ctx_block_cat != DCT_CHROMA_AC )
+            if( h->mb.i_psy_trellis && j && !dc && !b_chroma )
              {
                  int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
                  int predicted_coef = orig_coef - coef;
@@ -836,40 +860,44 @@ zeroblock:
  const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
  
  int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                           int i_qp, int ctx_block_cat, int b_intra, int b_chroma )
+                           int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ) /* idx is forwarded unchanged to the CABAC/CAVLC trellis calls below */
  {
      if( h->param.b_cabac )
          return quant_trellis_cabac( h, dct,
              h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
              NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, 0 );
+            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx ); /* b_ac=0, dc=1; 4 coefs for DCT_CHROMA_DC, 16 otherwise */
+
+    if( ctx_block_cat != DCT_CHROMA_DC ) /* CAVLC path only: the CABAC case has already returned above */
+        ctx_block_cat = DCT_LUMA_DC; /* every other DC category is handled as DCT_LUMA_DC, the only non-chroma-DC case quant_trellis_cavlc distinguishes when picking nC */
  
      return quant_trellis_cavlc( h, dct,
          h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
          NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
-        ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, 0, 0 );
+        ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx, 0 ); /* same arguments as the CABAC call, plus b_8x8=0 */
  }
  
  int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                              int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
  {
-    int b_ac = (ctx_block_cat == DCT_LUMA_AC || ctx_block_cat == DCT_CHROMA_AC);
+    static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0}; /* b_ac lookup indexed by ctx_block_cat: set at indexes 1, 4, 7 and 11 (presumably the *_AC categories -- confirm against the enum) */
+    int b_ac = ctx_ac[ctx_block_cat]; /* NOTE(review): no bounds check, so ctx_block_cat must be in 0..13 */
      if( h->param.b_cabac )
          return quant_trellis_cabac( h, dct,
              h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
              x264_dct4_weight2_zigzag[MB_INTERLACED],
              x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
+            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx ); /* dc=0, 16 coefficients */
  
      return quant_trellis_cavlc( h, dct,
              h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
              x264_dct4_weight2_zigzag[MB_INTERLACED],
              x264_zigzag_scan4[MB_INTERLACED],
-            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx, 0 );
+            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 ); /* dc=0, 16 coefficients, b_8x8=0 */
  }
  
  int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
-                            int i_qp, int b_intra, int idx )
+                            int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
  {
      if( h->param.b_cabac )
      {
@@ -877,7 +905,7 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
              h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
              x264_dct8_weight2_zigzag[MB_INTERLACED],
              x264_zigzag_scan8[MB_INTERLACED],
-            DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
+            ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx );
      }
  
      /* 8x8 CAVLC is split into 4 4x4 blocks */
@@ -888,7 +916,7 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
              h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
              x264_dct8_weight2_zigzag[MB_INTERLACED],
              x264_zigzag_scan8[MB_INTERLACED],
-            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 16, idx*4+i, 1 );
+            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 );
          /* Set up nonzero count for future calls */
          h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
          nzaccum |= nz;
diff --git a/encoder/set.c b/encoder/set.c

index d3a5f606a98aa22fdbb6dc388212c16e26ba832b..0044e584fbd1ec218c540f636a4036c083003826 100644 (file)
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -46,6 +46,8 @@ static void scaling_list_write( bs_t *s, x264_pps_t *pps, int idx )
      const uint8_t *list = pps->scaling_list[idx];
      const uint8_t *def_list = (idx==CQM_4IC) ? pps->scaling_list[CQM_4IY]
                              : (idx==CQM_4PC) ? pps->scaling_list[CQM_4PY]
+                            : (idx==CQM_8IC+4) ? pps->scaling_list[CQM_8IY+4]
+                            : (idx==CQM_8PC+4) ? pps->scaling_list[CQM_8PY+4]
                              : x264_cqm_jvt[idx];
      if( !memcmp( list, def_list, len ) )
          bs_write1( s, 0 );   // scaling_list_present_flag
@@ -100,9 +102,10 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
      sps->i_id = i_id;
      sps->i_mb_width = ( param->i_width + 15 ) / 16;
      sps->i_mb_height= ( param->i_height + 15 ) / 16;
+    sps->i_chroma_format_idc = param->i_csp >= X264_CSP_I444 ? 3 : 1;
  
      sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
-    if( sps->b_qpprime_y_zero_transform_bypass )
+    if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == 3 )
          sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
      else if( BIT_DEPTH > 8 )
          sps->i_profile_idc  = PROFILE_HIGH10;
@@ -130,6 +133,9 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
      /* High 10 Intra profile */
      if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH10 )
          sps->b_constraint_set3 = 1;
+    /* High 4:4:4 Intra profile */
+    if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH444_PREDICTIVE )
+        sps->b_constraint_set3 = 1;
  
      sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
      /* extra slot with pyramid so that we don't have to override the
@@ -267,7 +273,9 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
  
      if( sps->i_profile_idc >= PROFILE_HIGH )
      {
-        bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0
+        bs_write_ue( s, sps->i_chroma_format_idc );
+        if( sps->i_chroma_format_idc == 3 )
+            bs_write1( s, 0 ); // separate_colour_plane_flag
          bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
          bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
          bs_write1( s, sps->b_qpprime_y_zero_transform_bypass );
@@ -290,10 +298,11 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
      bs_write1( s, sps->b_crop );
      if( sps->b_crop )
      {
-        bs_write_ue( s, sps->crop.i_left   / 2 );
-        bs_write_ue( s, sps->crop.i_right  / 2 );
-        bs_write_ue( s, sps->crop.i_top    / 2 );
-        bs_write_ue( s, sps->crop.i_bottom / 2 );
+        int cropshift = sps->i_chroma_format_idc != 3;
+        bs_write_ue( s, sps->crop.i_left   >> cropshift );
+        bs_write_ue( s, sps->crop.i_right  >> cropshift );
+        bs_write_ue( s, sps->crop.i_top    >> cropshift );
+        bs_write_ue( s, sps->crop.i_bottom >> cropshift );
      }
  
      bs_write1( s, sps->b_vui );
@@ -427,31 +436,36 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
      pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;
  
      pps->i_cqm_preset = param->i_cqm_preset;
+
      switch( pps->i_cqm_preset )
      {
      case X264_CQM_FLAT:
-        for( int i = 0; i < 6; i++ )
+        for( int i = 0; i < 8; i++ )
              pps->scaling_list[i] = x264_cqm_flat16;
          break;
      case X264_CQM_JVT:
-        for( int i = 0; i < 6; i++ )
+        for( int i = 0; i < 8; i++ )
              pps->scaling_list[i] = x264_cqm_jvt[i];
          break;
      case X264_CQM_CUSTOM:
          /* match the transposed DCT & zigzag */
          transpose( param->cqm_4iy, 4 );
-        transpose( param->cqm_4ic, 4 );
          transpose( param->cqm_4py, 4 );
+        transpose( param->cqm_4ic, 4 );
          transpose( param->cqm_4pc, 4 );
          transpose( param->cqm_8iy, 8 );
          transpose( param->cqm_8py, 8 );
+        transpose( param->cqm_8ic, 8 );
+        transpose( param->cqm_8pc, 8 );
          pps->scaling_list[CQM_4IY] = param->cqm_4iy;
-        pps->scaling_list[CQM_4IC] = param->cqm_4ic;
          pps->scaling_list[CQM_4PY] = param->cqm_4py;
+        pps->scaling_list[CQM_4IC] = param->cqm_4ic;
          pps->scaling_list[CQM_4PC] = param->cqm_4pc;
          pps->scaling_list[CQM_8IY+4] = param->cqm_8iy;
          pps->scaling_list[CQM_8PY+4] = param->cqm_8py;
-        for( int i = 0; i < 6; i++ )
+        pps->scaling_list[CQM_8IC+4] = param->cqm_8ic;
+        pps->scaling_list[CQM_8PC+4] = param->cqm_8pc;
+        for( int i = 0; i < 8; i++ )
              for( int j = 0; j < (i < 4 ? 16 : 64); j++ )
                  if( pps->scaling_list[i][j] == 0 )
                      pps->scaling_list[i] = x264_cqm_jvt[i];
@@ -459,7 +473,7 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
      }
  }
  
-void x264_pps_write( bs_t *s, x264_pps_t *pps )
+void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
  {
      bs_realign( s );
      bs_write_ue( s, pps->i_id );
@@ -496,8 +510,20 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
              bs_write1( s, 0 ); // Cr = Cb
              if( pps->b_transform_8x8_mode )
              {
-                scaling_list_write( s, pps, CQM_8IY+4 );
-                scaling_list_write( s, pps, CQM_8PY+4 );
+                if( sps->i_chroma_format_idc == 3 )
+                {
+                    scaling_list_write( s, pps, CQM_8IY+4 );
+                    scaling_list_write( s, pps, CQM_8IC+4 );
+                    bs_write1( s, 0 ); // Cr = Cb
+                    scaling_list_write( s, pps, CQM_8PY+4 );
+                    scaling_list_write( s, pps, CQM_8PC+4 );
+                    bs_write1( s, 0 ); // Cr = Cb
+                }
+                else
+                {
+                    scaling_list_write( s, pps, CQM_8IY+4 );
+                    scaling_list_write( s, pps, CQM_8PY+4 );
+                }
              }
          }
          bs_write_se( s, pps->i_chroma_qp_index_offset );
@@ -726,7 +752,8 @@ int x264_validate_levels( x264_t *h, int verbose )
      int ret = 0;
      int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
      int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
-    int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
+    int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH444_PREDICTIVE ? 16 :
+                     h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
                       h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
  
      const x264_level_t *l = x264_levels;
diff --git a/encoder/set.h b/encoder/set.h

index 4f1eeabaac3b84d222f7b574c59402f573dc1ad5..5bd289f85f0f8bb1fcf5f275bf6e8f0b2438064d 100644 (file)
--- a/encoder/set.h
+++ b/encoder/set.h
@@ -30,7 +30,7 @@
  void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
  void x264_sps_write( bs_t *s, x264_sps_t *sps );
  void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
-void x264_pps_write( bs_t *s, x264_pps_t *pps );
+void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps );
  void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
  int  x264_sei_version_write( x264_t *h, bs_t *s );
  int  x264_validate_levels( x264_t *h, int verbose );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index 7cfd90b9f84cdd61967f3a2d3028250151f02767..88186610b4658fb06ce8ba46b55f810e92c86870 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -132,6 +132,33 @@ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc
      x264_emms();
  }
  
+static NOINLINE pixel *x264_weight_cost_init_chroma444( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dst, int p ) /* builds the plane-p reference used for 4:4:4 weight costing; returns dst (motion-compensated copy) or ref->plane[p] */
+{
+    int ref0_distance = fenc->i_frame - ref->i_frame - 1; /* index of this reference in fenc->lowres_mvs[0] */
+    int i_stride = fenc->i_stride[p];
+    int i_lines = fenc->i_lines[p];
+    int i_width = fenc->i_width[p];
+
+    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) /* 0x7FFF presumably marks "no lowres MVs available" -- confirm */
+    {
+        for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 16, pel_offset_y = y*i_stride )
+            for( int x = 0, pel_offset_x = 0; x < i_width; x += 16, mb_xy++, pel_offset_x += 16 ) /* one lowres MV (mb_xy) per 16x16 block of the plane */
+            {
+                pixel *pix = dst + pel_offset_y + pel_offset_x;
+                pixel *src = ref->plane[p] + pel_offset_y + pel_offset_x; /* compensate from the reference frame; reading fenc here would cost fenc against itself */
+                int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0] / 2;
+                int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1] / 2;
+                /* We don't want to calculate hpels for fenc frames, so we round the motion
+                 * vectors to fullpel here.  It's not too bad, I guess? */
+                h->mc.copy_16x16_unaligned( pix, i_stride, src+mvx+mvy*i_stride, i_stride, 16 ); /* NOTE(review): src+mv can land in ref's border padding -- confirm that area holds valid pixels here */
+            }
+        x264_emms();
+        return dst;
+    }
+    x264_emms();
+    return ref->plane[p]; /* no MVs: cost against the unmodified reference plane */
+}
+
  static int x264_weight_slice_header_cost( x264_t *h, x264_weight_t *w, int b_chroma )
  {
      /* Add cost of weights in the slice header. */
@@ -224,6 +251,33 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
      return cost;
  }
  
+static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w, int p ) /* sums 16x16 mbcmp costs of ref (weighted by w when w is non-NULL) against fenc plane p */
+{
+    unsigned int cost = 0;
+    int i_stride = fenc->i_stride[p];
+    int i_lines = fenc->i_lines[p];
+    int i_width = fenc->i_width[p];
+    pixel *src = fenc->plane[p];
+    ALIGNED_ARRAY_16( pixel, buf, [16*16] ); /* scratch for one weighted 16x16 block, stride 16 */
+    int pixoff = 0;
+    if( w )
+    {
+        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
+            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
+            {
+                w->weightfn[16>>2]( buf, 16, &ref[pixoff], i_stride, w, 16 );
+                cost += h->pixf.mbcmp[PIXEL_16x16]( buf, 16, &src[pixoff], i_stride );
+            }
+        cost += x264_weight_slice_header_cost( h, w, 1 ); /* add the slice-header cost of the weights (b_chroma=1) */
+    }
+    else
+        for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride )
+            for( int x = 0; x < i_width; x += 16, pixoff += 16 )
+                cost += h->pixf.mbcmp[PIXEL_16x16]( &ref[pixoff], i_stride, &src[pixoff], i_stride ); /* ref is a full plane addressed by pixoff, so its stride is i_stride; 16 is only the stride of buf */
+    x264_emms();
+    return cost;
+}
+
  void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
  {
      int i_delta_index = fenc->i_frame - ref->i_frame - 1;
@@ -285,12 +339,17 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
          }
          else
          {
-            pixel *dstu = h->mb.p_weight_buf[0];
-            pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
-            /* Only initialize chroma data once. */
-            if( plane == 1 )
-                x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
-            mcbuf = plane == 1 ? dstu : dstv;
+            if( CHROMA444 )
+                mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
+            else
+            {
+                pixel *dstu = h->mb.p_weight_buf[0];
+                pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
+                /* Only initialize chroma data once. */
+                if( plane == 1 )
+                    x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
+                mcbuf = plane == 1 ? dstu : dstv;
+            }
              origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
          }
  
@@ -308,7 +367,12 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
              SET_WEIGHT( weights[plane], 1, minscale, mindenom, i_off );
              unsigned int s;
              if( plane )
-                s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
+            {
+                if( CHROMA444 )
+                    s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
+                else
+                    s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
+            }
              else
                  s = x264_weight_cost_luma( h, fenc, mcbuf, &weights[plane] );
              COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
diff --git a/filters/video/depth.c b/filters/video/depth.c

index ac99e28d4def569425ca271cf8e9d5533e023ce1..25dde257f89de189c995efd56648ad5f21346834 100644 (file)
--- a/filters/video/depth.c
+++ b/filters/video/depth.c
@@ -46,6 +46,7 @@ static int depth_filter_csp_is_supported( int csp )
      return csp_mask == X264_CSP_I420 ||
             csp_mask == X264_CSP_I422 ||
             csp_mask == X264_CSP_I444 ||
+           csp_mask == X264_CSP_YV24 ||
             csp_mask == X264_CSP_YV12 ||
             csp_mask == X264_CSP_NV12;
  }
diff --git a/filters/video/resize.c b/filters/video/resize.c

index dad886da686cb3d9596e7ef178c6ac1cf2fae262..9a4ccc804fff9d86062cd8ddaf54417c39461df7 100644 (file)
--- a/filters/video/resize.c
+++ b/filters/video/resize.c
@@ -137,6 +137,7 @@ static int convert_csp_to_pix_fmt( int csp )
          case X264_CSP_YV12: /* specially handled via swapping chroma */
          case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P;
          case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P;
+        case X264_CSP_YV24: /* specially handled via swapping chroma */
          case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P;
          case X264_CSP_RGB:  return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_RGB48     : PIX_FMT_RGB24;
          /* the next 3 csps have no equivalent 16bit depth in swscale */
@@ -436,9 +437,11 @@ static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x2
      h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
      h->scale = h->dst;
  
-    /* swap chroma planes if YV12 is involved, as libswscale works with I420 */
-    h->pre_swap_chroma = (info->csp & X264_CSP_MASK) == X264_CSP_YV12;
-    h->post_swap_chroma = (h->dst_csp & X264_CSP_MASK) == X264_CSP_YV12;
+    /* swap chroma planes if YV12/YV24 is involved, as libswscale works with I420/I444 */
+    int src_csp = info->csp & X264_CSP_MASK;
+    int dst_csp = h->dst_csp & X264_CSP_MASK;
+    h->pre_swap_chroma  = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV24;
+    h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV24;
  
      int src_pix_fmt = convert_csp_to_pix_fmt( info->csp );
  
diff --git a/input/avs.c b/input/avs.c

index a8731a6e4b239d7a7488602094fb399f22c6a162..f3f1118d0df931895b8c92b5857bfe9bec1c01ca 100644 (file)
--- a/input/avs.c
+++ b/input/avs.c
@@ -121,6 +121,17 @@ static AVS_Value update_clip( avs_hnd_t *h, const AVS_VideoInfo **vi, AVS_Value
      return res;
  }
  
+static float get_avs_version( avs_hnd_t *h ) /* queries the script environment's VersionNumber function and returns its float result */
+{
+    FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" )
+    AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); /* invoked with an empty argument array and no argument names */
+    FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) )
+    FAIL_IF_ERROR( !avs_is_float( ver ), "VersionNumber did not return a float value\n" ); /* NOTE(review): ver is only released on the success path below -- confirm whether FAIL_IF_ERROR returns early and leaks it */
+    float ret = avs_as_float( ver );
+    h->func.avs_release_value( ver );
+    return ret; /* the caller in open_file treats a value <= 0 as failure */
+}
+
  static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
  {
      FILE *fh = fopen( psz_filename, "r" );
@@ -139,6 +150,10 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
          const char *error = h->func.avs_get_error( h->env );
          FAIL_IF_ERROR( error, "%s\n", error );
      }
+    float avs_version = get_avs_version( h );
+    if( avs_version <= 0 )
+        return -1;
+    x264_cli_log( "avs", X264_LOG_DEBUG, "using avisynth version %.2f\n", avs_version );
      AVS_Value arg = avs_new_value_string( psz_filename );
      AVS_Value res;
      char *filename_ext = get_filename_extension( psz_filename );
@@ -203,15 +218,21 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
          info->tff = avs_is_tff( vi );
      }
  #if !HAVE_SWSCALE
-    /* if swscale is not available, convert CSPs to yv12 */
-    if( !avs_is_yv12( vi ) )
+    /* if swscale is not available, convert the CSP if necessary */
+    if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) )
      {
-        x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to YV12\n" );
-        FAIL_IF_ERROR( vi->width&1 || vi->height&1, "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
+        FAIL_IF_ERROR( avs_version < 2.6f && opt->output_csp == X264_CSP_I444, "avisynth >= 2.6 is required for i444 output\n" )
+
+        const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : "YV24";
+        x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp );
+        FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && (vi->width&1 || vi->height&1),
+                       "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
          const char *arg_name[2] = { NULL, "interlaced" };
          AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
-        AVS_Value res2 = h->func.avs_invoke( h->env, "ConvertToYV12", avs_new_value_array( arg_arr, 2 ), arg_name );
-        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to YV12\n" )
+        char conv_func[14] = { "ConvertTo" };
+        strcat( conv_func, csp );
+        AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, 2 ), arg_name );
+        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s\n", csp )
          res = update_clip( h, &vi, res2, res );
      }
  #endif
@@ -243,7 +264,12 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
      else
          info->csp = X264_CSP_NONE;
  #else
-    info->csp = X264_CSP_I420;
+    if( avs_is_yv24( vi ) )
+        info->csp = X264_CSP_I444;
+    else if( avs_is_yv12( vi ) )
+        info->csp = X264_CSP_I420;
+    else
+        info->csp = X264_CSP_NONE;
  #endif
      info->vfr = 0;
  
diff --git a/input/input.c b/input/input.c

index 8bcf1f4080ae8dfcd12d929fd50d499080b89052..084499ae24494e0a381e019df3593ef15af68fce 100644 (file)
--- a/input/input.c
+++ b/input/input.c
@@ -29,6 +29,7 @@ const x264_cli_csp_t x264_cli_csps[] = {
      [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
      [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1,  1,  1 }, 2, 1 },
      [X264_CSP_I444] = { "i444", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
+    [X264_CSP_YV24] = { "yv24", 3, { 1,  1,  1 }, { 1,  1,  1 }, 1, 1 },
      [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
      [X264_CSP_NV12] = { "nv12", 2, { 1,  1 },     { 1, .5 },     2, 2 },
      [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },         1, 1 },
diff --git a/input/input.h b/input/input.h

index 1a94e0cc52362821c25a4b49b4c360ae6922bf3b..488833d0a1f0c1545a0278070080fef3bef170d9 100644 (file)
--- a/input/input.h
+++ b/input/input.h
@@ -41,6 +41,7 @@ typedef struct
      char *timebase;
      int seek;
      int progress;
+    int output_csp; /* convert to this csp, if applicable */
  } cli_input_opt_t;
  
  /* properties of the source given by the demuxer */
@@ -103,11 +104,10 @@ extern cli_input_t input;
  
  /* extended colorspace list that isn't supported by libx264 but by the cli */
  #define X264_CSP_I422           X264_CSP_MAX     /* yuv 4:2:2 planar    */
-#define X264_CSP_I444          (X264_CSP_MAX+1)  /* yuv 4:4:4 planar    */
-#define X264_CSP_BGR           (X264_CSP_MAX+2)  /* packed bgr 24bits   */
-#define X264_CSP_BGRA          (X264_CSP_MAX+3)  /* packed bgr 32bits   */
-#define X264_CSP_RGB           (X264_CSP_MAX+4)  /* packed rgb 24bits   */
-#define X264_CSP_CLI_MAX       (X264_CSP_MAX+5)  /* end of list         */
+#define X264_CSP_BGR           (X264_CSP_MAX+1)  /* packed bgr 24bits   */
+#define X264_CSP_BGRA          (X264_CSP_MAX+2)  /* packed bgr 32bits   */
+#define X264_CSP_RGB           (X264_CSP_MAX+3)  /* packed rgb 24bits   */
+#define X264_CSP_CLI_MAX       (X264_CSP_MAX+4)  /* end of list         */
  #define X264_CSP_OTHER          0x4000           /* non x264 colorspace */
  
  typedef struct
diff --git a/tools/checkasm.c b/tools/checkasm.c

index 00001bde6e191299ef815270a14eb814365acac7..ce737ecb55f1a5d6e8066297ed75b5b32c2eb19b 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -566,6 +566,7 @@ static int check_dct( int cpu_ref, int cpu_new )
  
      memset( h, 0, sizeof(*h) );
      x264_param_default( &h->param );
+    h->sps->i_chroma_format_idc = 1;
      h->chroma_qp_table = i_chroma_qp_table + 12;
      h->param.analyse.i_luma_deadzone[0] = 0;
      h->param.analyse.i_luma_deadzone[1] = 0;
@@ -1435,6 +1436,7 @@ static int check_quant( int cpu_ref, int cpu_new )
      x264_t h_buf;
      x264_t *h = &h_buf;
      memset( h, 0, sizeof(*h) );
+    h->sps->i_chroma_format_idc = 1;
      x264_param_default( &h->param );
      h->chroma_qp_table = i_chroma_qp_table + 12;
      h->param.analyse.b_transform_8x8 = 1;
@@ -1904,26 +1906,26 @@ static int check_intra( int cpu_ref, int cpu_new )
  }
  
  #define DECL_CABAC(cpu) \
-static void run_cabac_decision_##cpu( uint8_t *dst )\
+static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\
  {\
      x264_cabac_t cb;\
-    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
      x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
      for( int i = 0; i < 0x1000; i++ )\
          x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
  }\
-static void run_cabac_bypass_##cpu( uint8_t *dst )\
+static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\
  {\
      x264_cabac_t cb;\
-    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
      x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
      for( int i = 0; i < 0x1000; i++ )\
          x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
  }\
-static void run_cabac_terminal_##cpu( uint8_t *dst )\
+static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
  {\
      x264_cabac_t cb;\
-    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
+    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
      x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
      for( int i = 0; i < 0x1000; i++ )\
          x264_cabac_encode_terminal_##cpu( &cb );\
@@ -1940,28 +1942,30 @@ DECL_CABAC(asm)
  static int check_cabac( int cpu_ref, int cpu_new )
  {
      int ret = 0, ok, used_asm = 1;
+    x264_t h;
+    h.sps->i_chroma_format_idc = 3;
      if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
          return 0;
-    x264_cabac_init();
+    x264_cabac_init( &h );
  
      set_func_name( "cabac_encode_decision" );
      memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_decision_c, buf3 );
-    call_a( run_cabac_decision_asm, buf4 );
+    call_c( run_cabac_decision_c, &h, buf3 );
+    call_a( run_cabac_decision_asm, &h, buf4 );
      ok = !memcmp( buf3, buf4, 0x1000 );
      report( "cabac decision:" );
  
      set_func_name( "cabac_encode_bypass" );
      memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_bypass_c, buf3 );
-    call_a( run_cabac_bypass_asm, buf4 );
+    call_c( run_cabac_bypass_c, &h, buf3 );
+    call_a( run_cabac_bypass_asm, &h, buf4 );
      ok = !memcmp( buf3, buf4, 0x1000 );
      report( "cabac bypass:" );
  
      set_func_name( "cabac_encode_terminal" );
      memcpy( buf4, buf3, 0x1000 );
-    call_c( run_cabac_terminal_c, buf3 );
-    call_a( run_cabac_terminal_asm, buf4 );
+    call_c( run_cabac_terminal_c, &h, buf3 );
+    call_a( run_cabac_terminal_asm, &h, buf4 );
      ok = !memcmp( buf3, buf4, 0x1000 );
      report( "cabac terminal:" );
  
diff --git a/x264.c b/x264.c

index 4d693d485ca9f801a8bbcec2abd4719da9ebe07f..ea7c5cdfd8bd7e97f311ad0953674e8574e6b8a0 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -121,6 +121,7 @@ static const char * const muxer_names[] =
  
  static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
  static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
+static const char * const output_csp_names[] = { "i420", "i444", 0 };
  
  typedef struct
  {
@@ -729,6 +730,8 @@ static void help( x264_param_t *defaults, int longhelp )
      H1( "      --input-fmt <string>    Specify input file format (requires lavf support)\n" );
      H1( "      --input-csp <string>    Specify input colorspace format for raw input\n" );
      print_csp_names( longhelp );
+    H1( "      --output-csp <string>   Specify output colorspace [\"%s\"]\n"
+        "                                  - %s\n", output_csp_names[0], stringify_names( buf, output_csp_names ) );
      H1( "      --input-depth <integer> Specify input bit depth for raw input\n" );
      H1( "      --input-res <intxint>   Specify input resolution (width x height)\n" );
      H1( "      --index <string>        Filename for input index file\n" );
@@ -808,7 +811,8 @@ enum
      OPT_INPUT_RES,
      OPT_INPUT_CSP,
      OPT_INPUT_DEPTH,
-    OPT_DTS_COMPRESSION
+    OPT_DTS_COMPRESSION,
+    OPT_OUTPUT_CSP
  } OptionsOPT;
  
  static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
@@ -966,6 +970,7 @@ static struct option long_options[] =
      { "input-csp",   required_argument, NULL, OPT_INPUT_CSP },
      { "input-depth", required_argument, NULL, OPT_INPUT_DEPTH },
      { "dts-compress",      no_argument, NULL, OPT_DTS_COMPRESSION },
+    { "output-csp",  required_argument, NULL, OPT_OUTPUT_CSP },
      {0, 0, 0, 0}
  };
  
@@ -1085,7 +1090,7 @@ static int select_input( const char *demuxer, char *used_demuxer, char *filename
      return 0;
  }
  
-static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info, x264_param_t *param )
+static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info, x264_param_t *param, int output_csp )
  {
      x264_register_vid_filters();
  
@@ -1117,14 +1122,15 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
          param->i_height = info->height;
          param->i_width  = info->width;
      }
-    /* if the current csp is supported by libx264, have libx264 use this csp.
-     * otherwise change the csp to I420 and have libx264 use this.
-     * when more colorspaces are supported, this decision will need to be updated. */
+    /* keep the input csp if it already belongs to the requested output family (4:2:0 or 4:4:4); otherwise switch to the requested output csp */
+    param->i_csp = info->csp;
      int csp = info->csp & X264_CSP_MASK;
-    if( csp > X264_CSP_NONE && csp < X264_CSP_MAX )
-        param->i_csp = info->csp;
-    else
-        param->i_csp = X264_CSP_I420 | ( info->csp & X264_CSP_HIGH_DEPTH );
+    if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
+        param->i_csp = X264_CSP_I420;
+    else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
+        param->i_csp = X264_CSP_I444;
+    param->i_csp |= info->csp & X264_CSP_HIGH_DEPTH;
+
      if( x264_init_vid_filter( "resize", handle, &filter, info, param, NULL ) )
          return -1;
  
@@ -1185,6 +1191,7 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
      memset( &input_opt, 0, sizeof(cli_input_opt_t) );
      memset( &output_opt, 0, sizeof(cli_output_opt_t) );
      input_opt.bit_depth = 8;
+    int output_csp = defaults.i_csp;
      opt->b_progress = 1;
  
      /* Presets are applied before all other options. */
@@ -1238,7 +1245,7 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
                  param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
                  break;
              case OPT_SEEK:
-                opt->i_seek = input_opt.seek = X264_MAX( atoi( optarg ), 0 );
+                opt->i_seek = X264_MAX( atoi( optarg ), 0 );
                  break;
              case 'o':
                  output_filename = optarg;
@@ -1339,6 +1346,11 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
              case OPT_DTS_COMPRESSION:
                  output_opt.use_dts_compress = 1;
                  break;
+            case OPT_OUTPUT_CSP:
+                FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
+                // map the parsed value (0 selects i420, anything else i444) to the libx264 X264_CSP_* constant
+                output_csp = !output_csp ? X264_CSP_I420 : X264_CSP_I444;
+                break;
              default:
  generic_option:
              {
@@ -1399,7 +1411,9 @@ generic_option:
      info.tff        = param->b_tff;
      info.vfr        = param->b_vfr_input;
  
+    input_opt.seek = opt->i_seek;
      input_opt.progress = opt->b_progress;
+    input_opt.output_csp = output_csp;
  
      if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
          return -1;
@@ -1476,7 +1490,7 @@ generic_option:
          info.tff = param->b_tff;
      }
  
-    if( init_vid_filters( vid_filters, &opt->hin, &info, param ) )
+    if( init_vid_filters( vid_filters, &opt->hin, &info, param, output_csp ) )
          return -1;
  
      /* set param flags from the post-filtered video */
diff --git a/x264.h b/x264.h

index 0dbfe57213719ee1b253aafc36674d49424acb27..f936443d20904d46f95596b6f0966d895d737406 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
  
  #include "x264_config.h"
  
-#define X264_BUILD 115
+#define X264_BUILD 116
  
  /* x264_t:
   *      opaque handler for encoder */
@@ -180,7 +180,9 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
  #define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
  #define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
  #define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_MAX            0x0004  /* end of list */
+#define X264_CSP_I444           0x0004  /* yuv 4:4:4 planar */
+#define X264_CSP_YV24           0x0005  /* yvu 4:4:4 planar */
+#define X264_CSP_MAX            0x0006  /* end of list */
  #define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
  #define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
  
@@ -293,11 +295,13 @@ typedef struct x264_param_t
      int         i_cqm_preset;
      char        *psz_cqm_file;      /* JM format */
      uint8_t     cqm_4iy[16];        /* used only if i_cqm_preset == X264_CQM_CUSTOM */
-    uint8_t     cqm_4ic[16];
      uint8_t     cqm_4py[16];
+    uint8_t     cqm_4ic[16];
      uint8_t     cqm_4pc[16];
      uint8_t     cqm_8iy[64];
      uint8_t     cqm_8py[64];
+    uint8_t     cqm_8ic[64];
+    uint8_t     cqm_8pc[64];
  
      /* Log */
      void        (*pf_log)( void *, int i_level, const char *psz, va_list );
author	Fiona Glaser <fiona@x264.com>
	Wed, 22 Jun 2011 10:32:53 +0000 (03:32 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Sun, 10 Jul 2011 04:15:52 +0000 (21:15 -0700)
common/cabac.c		patch \| blob \| history
common/cabac.h		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/deblock.c		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/mc.c		patch \| blob \| history
common/quant.c		patch \| blob \| history
common/quant.h		patch \| blob \| history
common/set.c		patch \| blob \| history
common/set.h		patch \| blob \| history
common/x86/cabac-a.asm		patch \| blob \| history
encoder/analyse.c		patch \| blob \| history
encoder/cabac.c		patch \| blob \| history
encoder/cavlc.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/macroblock.c		patch \| blob \| history
encoder/macroblock.h		patch \| blob \| history
encoder/me.c		patch \| blob \| history
encoder/me.h		patch \| blob \| history
encoder/ratecontrol.c		patch \| blob \| history
encoder/rdo.c		patch \| blob \| history
encoder/set.c		patch \| blob \| history
encoder/set.h		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
filters/video/depth.c		patch \| blob \| history
filters/video/resize.c		patch \| blob \| history
input/avs.c		patch \| blob \| history
input/input.c		patch \| blob \| history
input/input.h		patch \| blob \| history
tools/checkasm.c		patch \| blob \| history
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history