git.sesse.net Git - ffmpeg/blob - libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  */
  18
  19 #include "avcodec.h"
  20 #include "common.h"
  21 #include "dsputil.h"
  22 #include "snow.h"
  23
  24 #include "rangecoder.h"
  25
  26 #include "mpegvideo.h"
  27
  28 #undef NDEBUG
  29 #include <assert.h>
  30
  31 static const int8_t quant3[256]={
  32  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  33  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  48 };
  49 static const int8_t quant3b[256]={
  50  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  51  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 };
  67 static const int8_t quant3bA[256]={
  68  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  69  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84 };
  85 static const int8_t quant5[256]={
  86  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  87  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 102 };
 103 static const int8_t quant7[256]={
 104  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 105  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 107  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 118 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 120 };
 121 static const int8_t quant9[256]={
 122  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 123  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 124  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 137 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 138 };
 139 static const int8_t quant11[256]={
 140  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 141  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 142  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 143  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 154 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 155 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 156 };
 157 static const int8_t quant13[256]={
 158  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 159  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 160  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 162  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 171 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 174 };
 175
 176 #if 0 //64*cubic
 177 static const uint8_t obmc32[1024]={
 178  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 179  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 180  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 181  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 182  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 183  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 184  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 185  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 186  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 187  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 188  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 189  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 190  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 191  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 192  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 193  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 194  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 195  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 196  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 197  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 198  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 199  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 200  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 201  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 202  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 203  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 204  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 205  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 206  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 207  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 208  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 209  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 210 //error:0.000022
 211 };
 212 static const uint8_t obmc16[256]={
 213  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 214  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 215  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 216  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 217  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 218  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 219  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 220  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 221  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 222  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 223  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 224  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 225  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 226  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 227  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 228  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 229 //error:0.000033
 230 };
 231 #elif 1 // 64*linear
 232 static const uint8_t obmc32[1024]={
 233   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 234   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 235   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 236   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 237   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 238   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 239   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 240   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 241   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 242   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 243   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 244   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 245   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 246   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 247   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 248   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 251   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 252   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 253   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 254   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 255   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 256   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 257   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 258   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 259   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 260   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 261   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 262   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 263   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 264   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 265  //error:0.000020
 266 };
 267 static const uint8_t obmc16[256]={
 268   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 269   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 270   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 271   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 272   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 273  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 274  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 275  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 278  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 279   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 280   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 281   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 282   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 283   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 284 //error:0.000015
 285 };
 286 #else //64*cos
 287 static const uint8_t obmc32[1024]={
 288  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 289  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 290  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 291  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 292  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 293  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 294  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 295  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 296  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 297  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 298  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 299  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 300  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 301  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 302  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 303  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 304  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 305  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 306  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 307  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 308  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 309  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 310  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 311  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 312  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 313  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 314  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 315  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 316  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 317  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 318  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 319  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 320 //error:0.000022
 321 };
 322 static const uint8_t obmc16[256]={
 323  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 324  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 325  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 326  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 327  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 328  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 329  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 330  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 331  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 332  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 333  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 334  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 335  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 336  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 337  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 338  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 339 //error:0.000022
 340 };
 341 #endif
 342
 343 //linear *64
 344 static const uint8_t obmc8[64]={
 345   4, 12, 20, 28, 28, 20, 12,  4,
 346  12, 36, 60, 84, 84, 60, 36, 12,
 347  20, 60,100,140,140,100, 60, 20,
 348  28, 84,140,196,196,140, 84, 28,
 349  28, 84,140,196,196,140, 84, 28,
 350  20, 60,100,140,140,100, 60, 20,
 351  12, 36, 60, 84, 84, 60, 36, 12,
 352   4, 12, 20, 28, 28, 20, 12,  4,
 353 //error:0.000000
 354 };
 355
 356 //linear *64
 357 static const uint8_t obmc4[16]={
 358  16, 48, 48, 16,
 359  48,144,144, 48,
 360  48,144,144, 48,
 361  16, 48, 48, 16,
 362 //error:0.000000
 363 };
 364
 365 static const uint8_t *obmc_tab[4]={
 366     obmc32, obmc16, obmc8, obmc4
 367 };
 368
 369 typedef struct BlockNode{
 370     int16_t mx;
 371     int16_t my;
 372     uint8_t ref;
 373     uint8_t color[3];
 374     uint8_t type;
 375 //#define TYPE_SPLIT    1
 376 #define BLOCK_INTRA   1
 377 #define BLOCK_OPT     2
 378 //#define TYPE_NOCOLOR  4
 379     uint8_t level; //FIXME merge into type?
 380 }BlockNode;
 381
 382 static const BlockNode null_block= { //FIXME add border maybe
 383     .color= {128,128,128},
 384     .mx= 0,
 385     .my= 0,
 386     .ref= 0,
 387     .type= 0,
 388     .level= 0,
 389 };
 390
 391 #define LOG2_MB_SIZE 4
 392 #define MB_SIZE (1<<LOG2_MB_SIZE)
 393
 394 typedef struct x_and_coeff{
 395     int16_t x;
 396     uint16_t coeff;
 397 } x_and_coeff;
 398
 399 typedef struct SubBand{
 400     int level;
 401     int stride;
 402     int width;
 403     int height;
 404     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 405     DWTELEM *buf;
 406     int buf_x_offset;
 407     int buf_y_offset;
 408     int stride_line; ///< Stride measured in lines, not pixels.
 409     x_and_coeff * x_coeff;
 410     struct SubBand *parent;
 411     uint8_t state[/*7*2*/ 7 + 512][32];
 412 }SubBand;
 413
 414 typedef struct Plane{
 415     int width;
 416     int height;
 417     SubBand band[MAX_DECOMPOSITIONS][4];
 418 }Plane;
 419
 420 typedef struct SnowContext{
 421 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 422
 423     AVCodecContext *avctx;
 424     RangeCoder c;
 425     DSPContext dsp;
 426     AVFrame new_picture;
 427     AVFrame input_picture;              ///< new_picture with the internal linesizes
 428     AVFrame current_picture;
 429     AVFrame last_picture[MAX_REF_FRAMES];
 430     AVFrame mconly_picture;
 431 //     uint8_t q_context[16];
 432     uint8_t header_state[32];
 433     uint8_t block_state[128 + 32*128];
 434     int keyframe;
 435     int always_reset;
 436     int version;
 437     int spatial_decomposition_type;
 438     int temporal_decomposition_type;
 439     int spatial_decomposition_count;
 440     int temporal_decomposition_count;
 441     int max_ref_frames;
 442     int ref_frames;
 443     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 444     uint32_t *ref_scores[MAX_REF_FRAMES];
 445     DWTELEM *spatial_dwt_buffer;
 446     int colorspace_type;
 447     int chroma_h_shift;
 448     int chroma_v_shift;
 449     int spatial_scalability;
 450     int qlog;
 451     int lambda;
 452     int lambda2;
 453     int pass1_rc;
 454     int mv_scale;
 455     int qbias;
 456 #define QBIAS_SHIFT 3
 457     int b_width;
 458     int b_height;
 459     int block_max_depth;
 460     Plane plane[MAX_PLANES];
 461     BlockNode *block;
 462 #define ME_CACHE_SIZE 1024
 463     int me_cache[ME_CACHE_SIZE];
 464     int me_cache_generation;
 465     slice_buffer sb;
 466
 467     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 468 }SnowContext;
 469
 470 typedef struct {
 471     DWTELEM *b0;
 472     DWTELEM *b1;
 473     DWTELEM *b2;
 474     DWTELEM *b3;
 475     int y;
 476 } dwt_compose_t;
 477
 478 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 479 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 480
 481 static void iterative_me(SnowContext *s);
 482
 483 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
 484 {
 485     int i;
 486
 487     buf->base_buffer = base_buffer;
 488     buf->line_count = line_count;
 489     buf->line_width = line_width;
 490     buf->data_count = max_allocated_lines;
 491     buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
 492     buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
 493
 494     for (i = 0; i < max_allocated_lines; i++)
 495     {
 496       buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
 497     }
 498
 499     buf->data_stack_top = max_allocated_lines - 1;
 500 }
 501
 502 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 503 {
 504     int offset;
 505     DWTELEM * buffer;
 506
 507 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 508
 509     assert(buf->data_stack_top >= 0);
 510 //  assert(!buf->line[line]);
 511     if (buf->line[line])
 512         return buf->line[line];
 513
 514     offset = buf->line_width * line;
 515     buffer = buf->data_stack[buf->data_stack_top];
 516     buf->data_stack_top--;
 517     buf->line[line] = buffer;
 518
 519 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 520
 521     return buffer;
 522 }
 523
 524 static void slice_buffer_release(slice_buffer * buf, int line)
 525 {
 526     int offset;
 527     DWTELEM * buffer;
 528
 529     assert(line >= 0 && line < buf->line_count);
 530     assert(buf->line[line]);
 531
 532     offset = buf->line_width * line;
 533     buffer = buf->line[line];
 534     buf->data_stack_top++;
 535     buf->data_stack[buf->data_stack_top] = buffer;
 536     buf->line[line] = NULL;
 537
 538 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 539 }
 540
 541 static void slice_buffer_flush(slice_buffer * buf)
 542 {
 543     int i;
 544     for (i = 0; i < buf->line_count; i++)
 545     {
 546         if (buf->line[i])
 547         {
 548 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 549             slice_buffer_release(buf, i);
 550         }
 551     }
 552 }
 553
 554 static void slice_buffer_destroy(slice_buffer * buf)
 555 {
 556     int i;
 557     slice_buffer_flush(buf);
 558
 559     for (i = buf->data_count - 1; i >= 0; i--)
 560     {
 561         assert(buf->data_stack[i]);
 562         av_freep(&buf->data_stack[i]);
 563     }
 564     assert(buf->data_stack);
 565     av_freep(&buf->data_stack);
 566     assert(buf->line);
 567     av_freep(&buf->line);
 568 }
 569
 570 #ifdef __sgi
 571 // Avoid a name clash on SGI IRIX
 572 #undef qexp
 573 #endif
 574 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 575 static uint8_t qexp[QROOT];
 576
 577 static inline int mirror(int v, int m){
 578     while((unsigned)v > (unsigned)m){
 579         v=-v;
 580         if(v<0) v+= 2*m;
 581     }
 582     return v;
 583 }
 584
 585 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 586     int i;
 587
 588     if(v){
 589         const int a= ABS(v);
 590         const int e= av_log2(a);
 591 #if 1
 592         const int el= FFMIN(e, 10);
 593         put_rac(c, state+0, 0);
 594
 595         for(i=0; i<el; i++){
 596             put_rac(c, state+1+i, 1);  //1..10
 597         }
 598         for(; i<e; i++){
 599             put_rac(c, state+1+9, 1);  //1..10
 600         }
 601         put_rac(c, state+1+FFMIN(i,9), 0);
 602
 603         for(i=e-1; i>=el; i--){
 604             put_rac(c, state+22+9, (a>>i)&1); //22..31
 605         }
 606         for(; i>=0; i--){
 607             put_rac(c, state+22+i, (a>>i)&1); //22..31
 608         }
 609
 610         if(is_signed)
 611             put_rac(c, state+11 + el, v < 0); //11..21
 612 #else
 613
 614         put_rac(c, state+0, 0);
 615         if(e<=9){
 616             for(i=0; i<e; i++){
 617                 put_rac(c, state+1+i, 1);  //1..10
 618             }
 619             put_rac(c, state+1+i, 0);
 620
 621             for(i=e-1; i>=0; i--){
 622                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 623             }
 624
 625             if(is_signed)
 626                 put_rac(c, state+11 + e, v < 0); //11..21
 627         }else{
 628             for(i=0; i<e; i++){
 629                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 630             }
 631             put_rac(c, state+1+FFMIN(i,9), 0);
 632
 633             for(i=e-1; i>=0; i--){
 634                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 635             }
 636
 637             if(is_signed)
 638                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 639         }
 640 #endif
 641     }else{
 642         put_rac(c, state+0, 1);
 643     }
 644 }
 645
 646 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 647     if(get_rac(c, state+0))
 648         return 0;
 649     else{
 650         int i, e, a;
 651         e= 0;
 652         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 653             e++;
 654         }
 655
 656         a= 1;
 657         for(i=e-1; i>=0; i--){
 658             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 659         }
 660
 661         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 662             return -a;
 663         else
 664             return a;
 665     }
 666 }
 667
 668 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 669     int i;
 670     int r= log2>=0 ? 1<<log2 : 1;
 671
 672     assert(v>=0);
 673     assert(log2>=-4);
 674
 675     while(v >= r){
 676         put_rac(c, state+4+log2, 1);
 677         v -= r;
 678         log2++;
 679         if(log2>0) r+=r;
 680     }
 681     put_rac(c, state+4+log2, 0);
 682
 683     for(i=log2-1; i>=0; i--){
 684         put_rac(c, state+31-i, (v>>i)&1);
 685     }
 686 }
 687
 688 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 689     int i;
 690     int r= log2>=0 ? 1<<log2 : 1;
 691     int v=0;
 692
 693     assert(log2>=-4);
 694
 695     while(get_rac(c, state+4+log2)){
 696         v+= r;
 697         log2++;
 698         if(log2>0) r+=r;
 699     }
 700
 701     for(i=log2-1; i>=0; i--){
 702         v+= get_rac(c, state+31-i)<<i;
 703     }
 704
 705     return v;
 706 }
 707
 708 static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 709     const int mirror_left= !highpass;
 710     const int mirror_right= (width&1) ^ highpass;
 711     const int w= (width>>1) - 1 + (highpass & width);
 712     int i;
 713
 714 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 715     if(mirror_left){
 716         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 717         dst += dst_step;
 718         src += src_step;
 719     }
 720
 721     for(i=0; i<w; i++){
 722         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 723     }
 724
 725     if(mirror_right){
 726         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 727     }
 728 }
 729
 730 #ifndef lift5
 731 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 732     const int mirror_left= !highpass;
 733     const int mirror_right= (width&1) ^ highpass;
 734     const int w= (width>>1) - 1 + (highpass & width);
 735     int i;
 736
 737     if(mirror_left){
 738         int r= 3*2*ref[0];
 739         r += r>>4;
 740         r += r>>8;
 741         dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
 742         dst += dst_step;
 743         src += src_step;
 744     }
 745
 746     for(i=0; i<w; i++){
 747         int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
 748         r += r>>4;
 749         r += r>>8;
 750         dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
 751     }
 752
 753     if(mirror_right){
 754         int r= 3*2*ref[w*ref_step];
 755         r += r>>4;
 756         r += r>>8;
 757         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
 758     }
 759 }
 760 #endif
 761
 762 #ifndef liftS
 763 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 764     const int mirror_left= !highpass;
 765     const int mirror_right= (width&1) ^ highpass;
 766     const int w= (width>>1) - 1 + (highpass & width);
 767     int i;
 768
 769     assert(shift == 4);
 770 #define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
 771     if(mirror_left){
 772         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 773         dst += dst_step;
 774         src += src_step;
 775     }
 776
 777     for(i=0; i<w; i++){
 778         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 779     }
 780
 781     if(mirror_right){
 782         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 783     }
 784 }
 785 #endif
 786
 787
 788 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
 789     int x, i;
 790
 791     for(x=start; x<width; x+=2){
 792         int64_t sum=0;
 793
 794         for(i=0; i<n; i++){
 795             int x2= x + 2*i - n + 1;
 796             if     (x2<     0) x2= -x2;
 797             else if(x2>=width) x2= 2*width-x2-2;
 798             sum += coeffs[i]*(int64_t)dst[x2];
 799         }
 800         if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
 801         else        dst[x] += (sum + (1<<shift)/2)>>shift;
 802     }
 803 }
 804
 805 static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
 806     int x, y, i;
 807     for(y=start; y<height; y+=2){
 808         for(x=0; x<width; x++){
 809             int64_t sum=0;
 810
 811             for(i=0; i<n; i++){
 812                 int y2= y + 2*i - n + 1;
 813                 if     (y2<      0) y2= -y2;
 814                 else if(y2>=height) y2= 2*height-y2-2;
 815                 sum += coeffs[i]*(int64_t)dst[x + y2*stride];
 816             }
 817             if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
 818             else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
 819         }
 820     }
 821 }
 822
 823 #define SCALEX 1
 824 #define LX0 0
 825 #define LX1 1
 826
 827 #if 0 // more accurate 9/7
 828 #define N1 2
 829 #define SHIFT1 14
 830 #define COEFFS1 (int[]){-25987,-25987}
 831 #define N2 2
 832 #define SHIFT2 19
 833 #define COEFFS2 (int[]){-27777,-27777}
 834 #define N3 2
 835 #define SHIFT3 15
 836 #define COEFFS3 (int[]){28931,28931}
 837 #define N4 2
 838 #define SHIFT4 15
 839 #define COEFFS4 (int[]){14533,14533}
 840 #elif 1 // 13/7 CRF
 841 #define N1 4
 842 #define SHIFT1 4
 843 #define COEFFS1 (int[]){1,-9,-9,1}
 844 #define N2 4
 845 #define SHIFT2 4
 846 #define COEFFS2 (int[]){-1,5,5,-1}
 847 #define N3 0
 848 #define SHIFT3 1
 849 #define COEFFS3 NULL
 850 #define N4 0
 851 #define SHIFT4 1
 852 #define COEFFS4 NULL
 853 #elif 1 // 3/5
 854 #define LX0 1
 855 #define LX1 0
 856 #define SCALEX 0.5
 857 #define N1 2
 858 #define SHIFT1 1
 859 #define COEFFS1 (int[]){1,1}
 860 #define N2 2
 861 #define SHIFT2 2
 862 #define COEFFS2 (int[]){-1,-1}
 863 #define N3 0
 864 #define SHIFT3 0
 865 #define COEFFS3 NULL
 866 #define N4 0
 867 #define SHIFT4 0
 868 #define COEFFS4 NULL
 869 #elif 1 // 11/5
 870 #define N1 0
 871 #define SHIFT1 1
 872 #define COEFFS1 NULL
 873 #define N2 2
 874 #define SHIFT2 2
 875 #define COEFFS2 (int[]){-1,-1}
 876 #define N3 2
 877 #define SHIFT3 0
 878 #define COEFFS3 (int[]){-1,-1}
 879 #define N4 4
 880 #define SHIFT4 7
 881 #define COEFFS4 (int[]){-5,29,29,-5}
 882 #define SCALEX 4
 883 #elif 1 // 9/7 CDF
 884 #define N1 2
 885 #define SHIFT1 7
 886 #define COEFFS1 (int[]){-203,-203}
 887 #define N2 2
 888 #define SHIFT2 12
 889 #define COEFFS2 (int[]){-217,-217}
 890 #define N3 2
 891 #define SHIFT3 7
 892 #define COEFFS3 (int[]){113,113}
 893 #define N4 2
 894 #define SHIFT4 9
 895 #define COEFFS4 (int[]){227,227}
 896 #define SCALEX 1
 897 #elif 1 // 7/5 CDF
 898 #define N1 0
 899 #define SHIFT1 1
 900 #define COEFFS1 NULL
 901 #define N2 2
 902 #define SHIFT2 2
 903 #define COEFFS2 (int[]){-1,-1}
 904 #define N3 2
 905 #define SHIFT3 0
 906 #define COEFFS3 (int[]){-1,-1}
 907 #define N4 2
 908 #define SHIFT4 4
 909 #define COEFFS4 (int[]){3,3}
 910 #elif 1 // 9/7 MN
 911 #define N1 4
 912 #define SHIFT1 4
 913 #define COEFFS1 (int[]){1,-9,-9,1}
 914 #define N2 2
 915 #define SHIFT2 2
 916 #define COEFFS2 (int[]){1,1}
 917 #define N3 0
 918 #define SHIFT3 1
 919 #define COEFFS3 NULL
 920 #define N4 0
 921 #define SHIFT4 1
 922 #define COEFFS4 NULL
 923 #else // 13/7 CRF
 924 #define N1 4
 925 #define SHIFT1 4
 926 #define COEFFS1 (int[]){1,-9,-9,1}
 927 #define N2 4
 928 #define SHIFT2 4
 929 #define COEFFS2 (int[]){-1,5,5,-1}
 930 #define N3 0
 931 #define SHIFT3 1
 932 #define COEFFS3 NULL
 933 #define N4 0
 934 #define SHIFT4 1
 935 #define COEFFS4 NULL
 936 #endif
 937 static void horizontal_decomposeX(DWTELEM *b, int width){
 938     DWTELEM temp[width];
 939     const int width2= width>>1;
 940     const int w2= (width+1)>>1;
 941     int x;
 942
 943     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
 944     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
 945     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
 946     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
 947
 948     for(x=0; x<width2; x++){
 949         temp[x   ]= b[2*x    ];
 950         temp[x+w2]= b[2*x + 1];
 951     }
 952     if(width&1)
 953         temp[x   ]= b[2*x    ];
 954     memcpy(b, temp, width*sizeof(int));
 955 }
 956
 957 static void horizontal_composeX(DWTELEM *b, int width){
 958     DWTELEM temp[width];
 959     const int width2= width>>1;
 960     int x;
 961     const int w2= (width+1)>>1;
 962
 963     memcpy(temp, b, width*sizeof(int));
 964     for(x=0; x<width2; x++){
 965         b[2*x    ]= temp[x   ];
 966         b[2*x + 1]= temp[x+w2];
 967     }
 968     if(width&1)
 969         b[2*x    ]= temp[x   ];
 970
 971     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
 972     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
 973     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
 974     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
 975 }
 976
 977 static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
 978     int x, y;
 979
 980     for(y=0; y<height; y++){
 981         for(x=0; x<width; x++){
 982             buffer[y*stride + x] *= SCALEX;
 983         }
 984     }
 985
 986     for(y=0; y<height; y++){
 987         horizontal_decomposeX(buffer + y*stride, width);
 988     }
 989
 990     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
 991     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
 992     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
 993     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
 994 }
 995
 996 static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
 997     int x, y;
 998
 999     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1000     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1001     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1002     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1003
1004     for(y=0; y<height; y++){
1005         horizontal_composeX(buffer + y*stride, width);
1006     }
1007
1008     for(y=0; y<height; y++){
1009         for(x=0; x<width; x++){
1010             buffer[y*stride + x] /= SCALEX;
1011         }
1012     }
1013 }
1014
1015 static void horizontal_decompose53i(DWTELEM *b, int width){
1016     DWTELEM temp[width];
1017     const int width2= width>>1;
1018     int x;
1019     const int w2= (width+1)>>1;
1020
1021     for(x=0; x<width2; x++){
1022         temp[x   ]= b[2*x    ];
1023         temp[x+w2]= b[2*x + 1];
1024     }
1025     if(width&1)
1026         temp[x   ]= b[2*x    ];
1027 #if 0
1028     {
1029     int A1,A2,A3,A4;
1030     A2= temp[1       ];
1031     A4= temp[0       ];
1032     A1= temp[0+width2];
1033     A1 -= (A2 + A4)>>1;
1034     A4 += (A1 + 1)>>1;
1035     b[0+width2] = A1;
1036     b[0       ] = A4;
1037     for(x=1; x+1<width2; x+=2){
1038         A3= temp[x+width2];
1039         A4= temp[x+1     ];
1040         A3 -= (A2 + A4)>>1;
1041         A2 += (A1 + A3 + 2)>>2;
1042         b[x+width2] = A3;
1043         b[x       ] = A2;
1044
1045         A1= temp[x+1+width2];
1046         A2= temp[x+2       ];
1047         A1 -= (A2 + A4)>>1;
1048         A4 += (A1 + A3 + 2)>>2;
1049         b[x+1+width2] = A1;
1050         b[x+1       ] = A4;
1051     }
1052     A3= temp[width-1];
1053     A3 -= A2;
1054     A2 += (A1 + A3 + 2)>>2;
1055     b[width -1] = A3;
1056     b[width2-1] = A2;
1057     }
1058 #else
1059     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1060     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1061 #endif
1062 }
1063
1064 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1065     int i;
1066
1067     for(i=0; i<width; i++){
1068         b1[i] -= (b0[i] + b2[i])>>1;
1069     }
1070 }
1071
1072 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1073     int i;
1074
1075     for(i=0; i<width; i++){
1076         b1[i] += (b0[i] + b2[i] + 2)>>2;
1077     }
1078 }
1079
1080 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1081     int y;
1082     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1083     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1084
1085     for(y=-2; y<height; y+=2){
1086         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1087         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1088
1089 {START_TIMER
1090         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1091         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1092 STOP_TIMER("horizontal_decompose53i")}
1093
1094 {START_TIMER
1095         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1096         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1097 STOP_TIMER("vertical_decompose53i*")}
1098
1099         b0=b2;
1100         b1=b3;
1101     }
1102 }
1103
1104 static void horizontal_decompose97i(DWTELEM *b, int width){
1105     DWTELEM temp[width];
1106     const int w2= (width+1)>>1;
1107
1108     lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1109     liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1110     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1111     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1112 }
1113
1114
1115 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1116     int i;
1117
1118     for(i=0; i<width; i++){
1119         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1120     }
1121 }
1122
1123 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1124     int i;
1125
1126     for(i=0; i<width; i++){
1127 #ifdef lift5
1128         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1129 #else
1130         int r= 3*(b0[i] + b2[i]);
1131         r+= r>>4;
1132         r+= r>>8;
1133         b1[i] += (r+W_CO)>>W_CS;
1134 #endif
1135     }
1136 }
1137
1138 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1139     int i;
1140
1141     for(i=0; i<width; i++){
1142 #ifdef liftS
1143         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1144 #else
1145         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1146 #endif
1147     }
1148 }
1149
1150 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1151     int i;
1152
1153     for(i=0; i<width; i++){
1154         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1155     }
1156 }
1157
1158 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1159     int y;
1160     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1161     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1162     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1163     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1164
1165     for(y=-4; y<height; y+=2){
1166         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1167         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1168
1169 {START_TIMER
1170         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1171         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1172 if(width>400){
1173 STOP_TIMER("horizontal_decompose97i")
1174 }}
1175
1176 {START_TIMER
1177         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1178         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1179         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1180         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1181
1182 if(width>400){
1183 STOP_TIMER("vertical_decompose97i")
1184 }}
1185
1186         b0=b2;
1187         b1=b3;
1188         b2=b4;
1189         b3=b5;
1190     }
1191 }
1192
1193 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1194     int level;
1195
1196     for(level=0; level<decomposition_count; level++){
1197         switch(type){
1198         case 0: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1199         case 1: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1200         case 2: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1201         }
1202     }
1203 }
1204
1205 static void horizontal_compose53i(DWTELEM *b, int width){
1206     DWTELEM temp[width];
1207     const int width2= width>>1;
1208     const int w2= (width+1)>>1;
1209     int x;
1210
1211 #if 0
1212     int A1,A2,A3,A4;
1213     A2= temp[1       ];
1214     A4= temp[0       ];
1215     A1= temp[0+width2];
1216     A1 -= (A2 + A4)>>1;
1217     A4 += (A1 + 1)>>1;
1218     b[0+width2] = A1;
1219     b[0       ] = A4;
1220     for(x=1; x+1<width2; x+=2){
1221         A3= temp[x+width2];
1222         A4= temp[x+1     ];
1223         A3 -= (A2 + A4)>>1;
1224         A2 += (A1 + A3 + 2)>>2;
1225         b[x+width2] = A3;
1226         b[x       ] = A2;
1227
1228         A1= temp[x+1+width2];
1229         A2= temp[x+2       ];
1230         A1 -= (A2 + A4)>>1;
1231         A4 += (A1 + A3 + 2)>>2;
1232         b[x+1+width2] = A1;
1233         b[x+1       ] = A4;
1234     }
1235     A3= temp[width-1];
1236     A3 -= A2;
1237     A2 += (A1 + A3 + 2)>>2;
1238     b[width -1] = A3;
1239     b[width2-1] = A2;
1240 #else
1241     lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1242     lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1243 #endif
1244     for(x=0; x<width2; x++){
1245         b[2*x    ]= temp[x   ];
1246         b[2*x + 1]= temp[x+w2];
1247     }
1248     if(width&1)
1249         b[2*x    ]= temp[x   ];
1250 }
1251
1252 static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1253     int i;
1254
1255     for(i=0; i<width; i++){
1256         b1[i] += (b0[i] + b2[i])>>1;
1257     }
1258 }
1259
1260 static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1261     int i;
1262
1263     for(i=0; i<width; i++){
1264         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1265     }
1266 }
1267
1268 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1269     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1270     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1271     cs->y = -1;
1272 }
1273
1274 static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1275     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1276     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1277     cs->y = -1;
1278 }
1279
1280 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1281     int y= cs->y;
1282
1283     DWTELEM *b0= cs->b0;
1284     DWTELEM *b1= cs->b1;
1285     DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1286     DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1287
1288 {START_TIMER
1289         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1290         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1291 STOP_TIMER("vertical_compose53i*")}
1292
1293 {START_TIMER
1294         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1295         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1296 STOP_TIMER("horizontal_compose53i")}
1297
1298     cs->b0 = b2;
1299     cs->b1 = b3;
1300     cs->y += 2;
1301 }
1302
1303 static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1304     int y= cs->y;
1305     DWTELEM *b0= cs->b0;
1306     DWTELEM *b1= cs->b1;
1307     DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1308     DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1309
1310 {START_TIMER
1311         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1312         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1313 STOP_TIMER("vertical_compose53i*")}
1314
1315 {START_TIMER
1316         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1317         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1318 STOP_TIMER("horizontal_compose53i")}
1319
1320     cs->b0 = b2;
1321     cs->b1 = b3;
1322     cs->y += 2;
1323 }
1324
1325 static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1326     dwt_compose_t cs;
1327     spatial_compose53i_init(&cs, buffer, height, stride);
1328     while(cs.y <= height)
1329         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1330 }
1331
1332
1333 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1334     DWTELEM temp[width];
1335     const int w2= (width+1)>>1;
1336
1337     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1338     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1339     liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1340     lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1341 }
1342
1343 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1344     int i;
1345
1346     for(i=0; i<width; i++){
1347         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1348     }
1349 }
1350
1351 static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1352     int i;
1353
1354     for(i=0; i<width; i++){
1355 #ifdef lift5
1356         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1357 #else
1358         int r= 3*(b0[i] + b2[i]);
1359         r+= r>>4;
1360         r+= r>>8;
1361         b1[i] -= (r+W_CO)>>W_CS;
1362 #endif
1363     }
1364 }
1365
1366 static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1367     int i;
1368
1369     for(i=0; i<width; i++){
1370 #ifdef liftS
1371         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1372 #else
1373         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1374 #endif
1375     }
1376 }
1377
1378 static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1379     int i;
1380
1381     for(i=0; i<width; i++){
1382         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1383     }
1384 }
1385
1386 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1387     int i;
1388
1389     for(i=0; i<width; i++){
1390 #ifndef lift5
1391         int r;
1392 #endif
1393         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1394 #ifdef lift5
1395         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1396 #else
1397         r= 3*(b2[i] + b4[i]);
1398         r+= r>>4;
1399         r+= r>>8;
1400         b3[i] -= (r+W_CO)>>W_CS;
1401 #endif
1402 #ifdef liftS
1403         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1404 #else
1405         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1406 #endif
1407         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1408     }
1409 }
1410
1411 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1412     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1413     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1414     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1415     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1416     cs->y = -3;
1417 }
1418
1419 static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1420     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1421     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1422     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1423     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1424     cs->y = -3;
1425 }
1426
1427 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1428     int y = cs->y;
1429
1430     DWTELEM *b0= cs->b0;
1431     DWTELEM *b1= cs->b1;
1432     DWTELEM *b2= cs->b2;
1433     DWTELEM *b3= cs->b3;
1434     DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1435     DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1436
1437 {START_TIMER
1438     if(y>0 && y+4<height){
1439         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1440     }else{
1441         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1442         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1443         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1444         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1445     }
1446 if(width>400){
1447 STOP_TIMER("vertical_compose97i")}}
1448
1449 {START_TIMER
1450         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1451         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1452 if(width>400 && y+0<(unsigned)height){
1453 STOP_TIMER("horizontal_compose97i")}}
1454
1455     cs->b0=b2;
1456     cs->b1=b3;
1457     cs->b2=b4;
1458     cs->b3=b5;
1459     cs->y += 2;
1460 }
1461
1462 static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1463     int y = cs->y;
1464     DWTELEM *b0= cs->b0;
1465     DWTELEM *b1= cs->b1;
1466     DWTELEM *b2= cs->b2;
1467     DWTELEM *b3= cs->b3;
1468     DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1469     DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1470
1471 {START_TIMER
1472         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1473         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1474         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1475         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1476 if(width>400){
1477 STOP_TIMER("vertical_compose97i")}}
1478
1479 {START_TIMER
1480         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1481         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1482 if(width>400 && b0 <= b2){
1483 STOP_TIMER("horizontal_compose97i")}}
1484
1485     cs->b0=b2;
1486     cs->b1=b3;
1487     cs->b2=b4;
1488     cs->b3=b5;
1489     cs->y += 2;
1490 }
1491
1492 static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1493     dwt_compose_t cs;
1494     spatial_compose97i_init(&cs, buffer, height, stride);
1495     while(cs.y <= height)
1496         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1497 }
1498
1499 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1500     int level;
1501     for(level=decomposition_count-1; level>=0; level--){
1502         switch(type){
1503         case 0: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1504         case 1: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1505         /* not slicified yet */
1506         case 2: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1507           av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1508         }
1509     }
1510 }
1511
1512 static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1513     int level;
1514     for(level=decomposition_count-1; level>=0; level--){
1515         switch(type){
1516         case 0: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1517         case 1: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1518         /* not slicified yet */
1519         case 2: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1520         }
1521     }
1522 }
1523
1524 static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1525     const int support = type==1 ? 3 : 5;
1526     int level;
1527     if(type==2) return;
1528
1529     for(level=decomposition_count-1; level>=0; level--){
1530         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1531             switch(type){
1532             case 0: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1533                     break;
1534             case 1: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1535                     break;
1536             case 2: break;
1537             }
1538         }
1539     }
1540 }
1541
1542 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1543     const int support = type==1 ? 3 : 5;
1544     int level;
1545     if(type==2) return;
1546
1547     for(level=decomposition_count-1; level>=0; level--){
1548         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1549             switch(type){
1550             case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1551                     break;
1552             case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1553                     break;
1554             case 2: break;
1555             }
1556         }
1557     }
1558 }
1559
1560 static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1561     if(type==2){
1562         int level;
1563         for(level=decomposition_count-1; level>=0; level--)
1564             spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1565     }else{
1566         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1567         int y;
1568         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1569         for(y=0; y<height; y+=4)
1570             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1571     }
1572 }
1573
1574 static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1575     const int w= b->width;
1576     const int h= b->height;
1577     int x, y;
1578
1579     if(1){
1580         int run=0;
1581         int runs[w*h];
1582         int run_index=0;
1583         int max_index;
1584
1585         for(y=0; y<h; y++){
1586             for(x=0; x<w; x++){
1587                 int v, p=0;
1588                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1589                 v= src[x + y*stride];
1590
1591                 if(y){
1592                     t= src[x + (y-1)*stride];
1593                     if(x){
1594                         lt= src[x - 1 + (y-1)*stride];
1595                     }
1596                     if(x + 1 < w){
1597                         rt= src[x + 1 + (y-1)*stride];
1598                     }
1599                 }
1600                 if(x){
1601                     l= src[x - 1 + y*stride];
1602                     /*if(x > 1){
1603                         if(orientation==1) ll= src[y + (x-2)*stride];
1604                         else               ll= src[x - 2 + y*stride];
1605                     }*/
1606                 }
1607                 if(parent){
1608                     int px= x>>1;
1609                     int py= y>>1;
1610                     if(px<b->parent->width && py<b->parent->height)
1611                         p= parent[px + py*2*stride];
1612                 }
1613                 if(!(/*ll|*/l|lt|t|rt|p)){
1614                     if(v){
1615                         runs[run_index++]= run;
1616                         run=0;
1617                     }else{
1618                         run++;
1619                     }
1620                 }
1621             }
1622         }
1623         max_index= run_index;
1624         runs[run_index++]= run;
1625         run_index=0;
1626         run= runs[run_index++];
1627
1628         put_symbol2(&s->c, b->state[30], max_index, 0);
1629         if(run_index <= max_index)
1630             put_symbol2(&s->c, b->state[1], run, 3);
1631
1632         for(y=0; y<h; y++){
1633             if(s->c.bytestream_end - s->c.bytestream < w*40){
1634                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1635                 return -1;
1636             }
1637             for(x=0; x<w; x++){
1638                 int v, p=0;
1639                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1640                 v= src[x + y*stride];
1641
1642                 if(y){
1643                     t= src[x + (y-1)*stride];
1644                     if(x){
1645                         lt= src[x - 1 + (y-1)*stride];
1646                     }
1647                     if(x + 1 < w){
1648                         rt= src[x + 1 + (y-1)*stride];
1649                     }
1650                 }
1651                 if(x){
1652                     l= src[x - 1 + y*stride];
1653                     /*if(x > 1){
1654                         if(orientation==1) ll= src[y + (x-2)*stride];
1655                         else               ll= src[x - 2 + y*stride];
1656                     }*/
1657                 }
1658                 if(parent){
1659                     int px= x>>1;
1660                     int py= y>>1;
1661                     if(px<b->parent->width && py<b->parent->height)
1662                         p= parent[px + py*2*stride];
1663                 }
1664                 if(/*ll|*/l|lt|t|rt|p){
1665                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1666
1667                     put_rac(&s->c, &b->state[0][context], !!v);
1668                 }else{
1669                     if(!run){
1670                         run= runs[run_index++];
1671
1672                         if(run_index <= max_index)
1673                             put_symbol2(&s->c, b->state[1], run, 3);
1674                         assert(v);
1675                     }else{
1676                         run--;
1677                         assert(!v);
1678                     }
1679                 }
1680                 if(v){
1681                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1682                     int l2= 2*ABS(l) + (l<0);
1683                     int t2= 2*ABS(t) + (t<0);
1684
1685                     put_symbol2(&s->c, b->state[context + 2], ABS(v)-1, context-4);
1686                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1687                 }
1688             }
1689         }
1690     }
1691     return 0;
1692 }
1693
1694 static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1695 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1696 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1697     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1698 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1699 }
1700
1701 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1702     const int w= b->width;
1703     const int h= b->height;
1704     int x,y;
1705
1706     if(1){
1707         int run, runs;
1708         x_and_coeff *xc= b->x_coeff;
1709         x_and_coeff *prev_xc= NULL;
1710         x_and_coeff *prev2_xc= xc;
1711         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1712         x_and_coeff *prev_parent_xc= parent_xc;
1713
1714         runs= get_symbol2(&s->c, b->state[30], 0);
1715         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1716         else           run= INT_MAX;
1717
1718         for(y=0; y<h; y++){
1719             int v=0;
1720             int lt=0, t=0, rt=0;
1721
1722             if(y && prev_xc->x == 0){
1723                 rt= prev_xc->coeff;
1724             }
1725             for(x=0; x<w; x++){
1726                 int p=0;
1727                 const int l= v;
1728
1729                 lt= t; t= rt;
1730
1731                 if(y){
1732                     if(prev_xc->x <= x)
1733                         prev_xc++;
1734                     if(prev_xc->x == x + 1)
1735                         rt= prev_xc->coeff;
1736                     else
1737                         rt=0;
1738                 }
1739                 if(parent_xc){
1740                     if(x>>1 > parent_xc->x){
1741                         parent_xc++;
1742                     }
1743                     if(x>>1 == parent_xc->x){
1744                         p= parent_xc->coeff;
1745                     }
1746                 }
1747                 if(/*ll|*/l|lt|t|rt|p){
1748                     int context= av_log2(/*ABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1749
1750                     v=get_rac(&s->c, &b->state[0][context]);
1751                     if(v){
1752                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1753                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1754
1755                         xc->x=x;
1756                         (xc++)->coeff= v;
1757                     }
1758                 }else{
1759                     if(!run){
1760                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1761                         else           run= INT_MAX;
1762                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1763                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1764
1765                         xc->x=x;
1766                         (xc++)->coeff= v;
1767                     }else{
1768                         int max_run;
1769                         run--;
1770                         v=0;
1771
1772                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1773                         else  max_run= FFMIN(run, w-x-1);
1774                         if(parent_xc)
1775                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1776                         x+= max_run;
1777                         run-= max_run;
1778                     }
1779                 }
1780             }
1781             (xc++)->x= w+1; //end marker
1782             prev_xc= prev2_xc;
1783             prev2_xc= xc;
1784
1785             if(parent_xc){
1786                 if(y&1){
1787                     while(parent_xc->x != parent->width+1)
1788                         parent_xc++;
1789                     parent_xc++;
1790                     prev_parent_xc= parent_xc;
1791                 }else{
1792                     parent_xc= prev_parent_xc;
1793                 }
1794             }
1795         }
1796
1797         (xc++)->x= w+1; //end marker
1798     }
1799 }
1800
1801 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1802     const int w= b->width;
1803     int y;
1804     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1805     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1806     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1807     int new_index = 0;
1808
1809     START_TIMER
1810
1811     if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1812         qadd= 0;
1813         qmul= 1<<QEXPSHIFT;
1814     }
1815
1816     /* If we are on the second or later slice, restore our index. */
1817     if (start_y != 0)
1818         new_index = save_state[0];
1819
1820
1821     for(y=start_y; y<h; y++){
1822         int x = 0;
1823         int v;
1824         DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1825         memset(line, 0, b->width*sizeof(DWTELEM));
1826         v = b->x_coeff[new_index].coeff;
1827         x = b->x_coeff[new_index++].x;
1828         while(x < w)
1829         {
1830             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1831             register int u= -(v&1);
1832             line[x] = (t^u) - u;
1833
1834             v = b->x_coeff[new_index].coeff;
1835             x = b->x_coeff[new_index++].x;
1836         }
1837     }
1838     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1839         STOP_TIMER("decode_subband")
1840     }
1841
1842     /* Save our variables for the next slice. */
1843     save_state[0] = new_index;
1844
1845     return;
1846 }
1847
1848 static void reset_contexts(SnowContext *s){
1849     int plane_index, level, orientation;
1850
1851     for(plane_index=0; plane_index<3; plane_index++){
1852         for(level=0; level<s->spatial_decomposition_count; level++){
1853             for(orientation=level ? 1:0; orientation<4; orientation++){
1854                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1855             }
1856         }
1857     }
1858     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1859     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1860 }
1861
1862 static int alloc_blocks(SnowContext *s){
1863     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1864     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1865
1866     s->b_width = w;
1867     s->b_height= h;
1868
1869     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1870     return 0;
1871 }
1872
1873 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1874     uint8_t *bytestream= d->bytestream;
1875     uint8_t *bytestream_start= d->bytestream_start;
1876     *d= *s;
1877     d->bytestream= bytestream;
1878     d->bytestream_start= bytestream_start;
1879 }
1880
1881 //near copy & paste from dsputil, FIXME
1882 static int pix_sum(uint8_t * pix, int line_size, int w)
1883 {
1884     int s, i, j;
1885
1886     s = 0;
1887     for (i = 0; i < w; i++) {
1888         for (j = 0; j < w; j++) {
1889             s += pix[0];
1890             pix ++;
1891         }
1892         pix += line_size - w;
1893     }
1894     return s;
1895 }
1896
1897 //near copy & paste from dsputil, FIXME
1898 static int pix_norm1(uint8_t * pix, int line_size, int w)
1899 {
1900     int s, i, j;
1901     uint32_t *sq = squareTbl + 256;
1902
1903     s = 0;
1904     for (i = 0; i < w; i++) {
1905         for (j = 0; j < w; j ++) {
1906             s += sq[pix[0]];
1907             pix ++;
1908         }
1909         pix += line_size - w;
1910     }
1911     return s;
1912 }
1913
1914 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1915     const int w= s->b_width << s->block_max_depth;
1916     const int rem_depth= s->block_max_depth - level;
1917     const int index= (x + y*w) << rem_depth;
1918     const int block_w= 1<<rem_depth;
1919     BlockNode block;
1920     int i,j;
1921
1922     block.color[0]= l;
1923     block.color[1]= cb;
1924     block.color[2]= cr;
1925     block.mx= mx;
1926     block.my= my;
1927     block.ref= ref;
1928     block.type= type;
1929     block.level= level;
1930
1931     for(j=0; j<block_w; j++){
1932         for(i=0; i<block_w; i++){
1933             s->block[index + i + j*w]= block;
1934         }
1935     }
1936 }
1937
1938 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1939     const int offset[3]= {
1940           y*c->  stride + x,
1941         ((y*c->uvstride + x)>>1),
1942         ((y*c->uvstride + x)>>1),
1943     };
1944     int i;
1945     for(i=0; i<3; i++){
1946         c->src[0][i]= src [i];
1947         c->ref[0][i]= ref [i] + offset[i];
1948     }
1949     assert(!ref_index);
1950 }
1951
1952 //FIXME copy&paste
1953 #define P_LEFT P[1]
1954 #define P_TOP P[2]
1955 #define P_TOPRIGHT P[3]
1956 #define P_MEDIAN P[4]
1957 #define P_MV1 P[9]
1958 #define FLAG_QPEL   1 //must be 1
1959
1960 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1961     uint8_t p_buffer[1024];
1962     uint8_t i_buffer[1024];
1963     uint8_t p_state[sizeof(s->block_state)];
1964     uint8_t i_state[sizeof(s->block_state)];
1965     RangeCoder pc, ic;
1966     uint8_t *pbbak= s->c.bytestream;
1967     uint8_t *pbbak_start= s->c.bytestream_start;
1968     int score, score2, iscore, i_len, p_len, block_s, sum;
1969     const int w= s->b_width  << s->block_max_depth;
1970     const int h= s->b_height << s->block_max_depth;
1971     const int rem_depth= s->block_max_depth - level;
1972     const int index= (x + y*w) << rem_depth;
1973     const int block_w= 1<<(LOG2_MB_SIZE - level);
1974     int trx= (x+1)<<rem_depth;
1975     int try= (y+1)<<rem_depth;
1976     BlockNode *left  = x ? &s->block[index-1] : &null_block;
1977     BlockNode *top   = y ? &s->block[index-w] : &null_block;
1978     BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1979     BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1980     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1981     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1982     int pl = left->color[0];
1983     int pcb= left->color[1];
1984     int pcr= left->color[2];
1985     int pmx= mid_pred(left->mx, top->mx, tr->mx);
1986     int pmy= mid_pred(left->my, top->my, tr->my);
1987     int mx=0, my=0;
1988     int l,cr,cb;
1989     const int stride= s->current_picture.linesize[0];
1990     const int uvstride= s->current_picture.linesize[1];
1991     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1992                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1993                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1994     int P[10][2];
1995     int16_t last_mv[3][2];
1996     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1997     const int shift= 1+qpel;
1998     MotionEstContext *c= &s->m.me;
1999     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2000     int mx_context= av_log2(2*ABS(left->mx - top->mx));
2001     int my_context= av_log2(2*ABS(left->my - top->my));
2002     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2003     int ref, best_ref, ref_score, ref_mx, ref_my;
2004
2005     assert(sizeof(s->block_state) >= 256);
2006     if(s->keyframe){
2007         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, 0, BLOCK_INTRA);
2008         return 0;
2009     }
2010
2011 //    clip predictors / edge ?
2012
2013     P_LEFT[0]= left->mx;
2014     P_LEFT[1]= left->my;
2015     P_TOP [0]= top->mx;
2016     P_TOP [1]= top->my;
2017     P_TOPRIGHT[0]= tr->mx;
2018     P_TOPRIGHT[1]= tr->my;
2019
2020     last_mv[0][0]= s->block[index].mx;
2021     last_mv[0][1]= s->block[index].my;
2022     last_mv[1][0]= right->mx;
2023     last_mv[1][1]= right->my;
2024     last_mv[2][0]= bottom->mx;
2025     last_mv[2][1]= bottom->my;
2026
2027     s->m.mb_stride=2;
2028     s->m.mb_x=
2029     s->m.mb_y= 0;
2030     s->m.me.skip= 0;
2031
2032     assert(s->m.me.  stride ==   stride);
2033     assert(s->m.me.uvstride == uvstride);
2034
2035     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2036     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2037     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2038     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2039
2040     c->xmin = - x*block_w - 16+2;
2041     c->ymin = - y*block_w - 16+2;
2042     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2043     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2044
2045     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2046     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2047     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2048     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2049     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2050     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2051     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2052
2053     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2054     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2055
2056     if (!y) {
2057         c->pred_x= P_LEFT[0];
2058         c->pred_y= P_LEFT[1];
2059     } else {
2060         c->pred_x = P_MEDIAN[0];
2061         c->pred_y = P_MEDIAN[1];
2062     }
2063
2064     score= INT_MAX;
2065     best_ref= 0;
2066     for(ref=0; ref<s->ref_frames; ref++){
2067         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2068
2069         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2070                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2071
2072         assert(ref_mx >= c->xmin);
2073         assert(ref_mx <= c->xmax);
2074         assert(ref_my >= c->ymin);
2075         assert(ref_my <= c->ymax);
2076
2077         ref_score= s->m.me.sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2078         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2079         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2080         if(s->ref_mvs[ref]){
2081             s->ref_mvs[ref][index][0]= ref_mx;
2082             s->ref_mvs[ref][index][1]= ref_my;
2083             s->ref_scores[ref][index]= ref_score;
2084         }
2085         if(score > ref_score){
2086             score= ref_score;
2087             best_ref= ref;
2088             mx= ref_mx;
2089             my= ref_my;
2090         }
2091     }
2092     //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2093
2094   //  subpel search
2095     pc= s->c;
2096     pc.bytestream_start=
2097     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2098     memcpy(p_state, s->block_state, sizeof(s->block_state));
2099
2100     if(level!=s->block_max_depth)
2101         put_rac(&pc, &p_state[4 + s_context], 1);
2102     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2103     if(s->ref_frames > 1)
2104         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2105     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2106     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2107     p_len= pc.bytestream - pc.bytestream_start;
2108     score += (s->lambda2*(p_len*8
2109               + (pc.outstanding_count - s->c.outstanding_count)*8
2110               + (-av_log2(pc.range)    + av_log2(s->c.range))
2111              ))>>FF_LAMBDA_SHIFT;
2112
2113     block_s= block_w*block_w;
2114     sum = pix_sum(current_data[0], stride, block_w);
2115     l= (sum + block_s/2)/block_s;
2116     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2117
2118     block_s= block_w*block_w>>2;
2119     sum = pix_sum(current_data[1], uvstride, block_w>>1);
2120     cb= (sum + block_s/2)/block_s;
2121 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2122     sum = pix_sum(current_data[2], uvstride, block_w>>1);
2123     cr= (sum + block_s/2)/block_s;
2124 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2125
2126     ic= s->c;
2127     ic.bytestream_start=
2128     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2129     memcpy(i_state, s->block_state, sizeof(s->block_state));
2130     if(level!=s->block_max_depth)
2131         put_rac(&ic, &i_state[4 + s_context], 1);
2132     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2133     put_symbol(&ic, &i_state[32],  l-pl , 1);
2134     put_symbol(&ic, &i_state[64], cb-pcb, 1);
2135     put_symbol(&ic, &i_state[96], cr-pcr, 1);
2136     i_len= ic.bytestream - ic.bytestream_start;
2137     iscore += (s->lambda2*(i_len*8
2138               + (ic.outstanding_count - s->c.outstanding_count)*8
2139               + (-av_log2(ic.range)    + av_log2(s->c.range))
2140              ))>>FF_LAMBDA_SHIFT;
2141
2142 //    assert(score==256*256*256*64-1);
2143     assert(iscore < 255*255*256 + s->lambda2*10);
2144     assert(iscore >= 0);
2145     assert(l>=0 && l<=255);
2146     assert(pl>=0 && pl<=255);
2147
2148     if(level==0){
2149         int varc= iscore >> 8;
2150         int vard= score >> 8;
2151         if (vard <= 64 || vard < varc)
2152             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2153         else
2154             c->scene_change_score+= s->m.qscale;
2155     }
2156
2157     if(level!=s->block_max_depth){
2158         put_rac(&s->c, &s->block_state[4 + s_context], 0);
2159         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2160         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2161         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2162         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2163         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2164
2165         if(score2 < score && score2 < iscore)
2166             return score2;
2167     }
2168
2169     if(iscore < score){
2170         memcpy(pbbak, i_buffer, i_len);
2171         s->c= ic;
2172         s->c.bytestream_start= pbbak_start;
2173         s->c.bytestream= pbbak + i_len;
2174         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2175         memcpy(s->block_state, i_state, sizeof(s->block_state));
2176         return iscore;
2177     }else{
2178         memcpy(pbbak, p_buffer, p_len);
2179         s->c= pc;
2180         s->c.bytestream_start= pbbak_start;
2181         s->c.bytestream= pbbak + p_len;
2182         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2183         memcpy(s->block_state, p_state, sizeof(s->block_state));
2184         return score;
2185     }
2186 }
2187
2188 static always_inline int same_block(BlockNode *a, BlockNode *b){
2189     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2190         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2191     }else{
2192         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2193     }
2194 }
2195
2196 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2197     const int w= s->b_width  << s->block_max_depth;
2198     const int rem_depth= s->block_max_depth - level;
2199     const int index= (x + y*w) << rem_depth;
2200     int trx= (x+1)<<rem_depth;
2201     BlockNode *b= &s->block[index];
2202     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2203     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2204     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2205     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2206     int pl = left->color[0];
2207     int pcb= left->color[1];
2208     int pcr= left->color[2];
2209     int pmx= mid_pred(left->mx, top->mx, tr->mx);
2210     int pmy= mid_pred(left->my, top->my, tr->my);
2211     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2212     int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 16*!!b->ref;
2213     int my_context= av_log2(2*ABS(left->my - top->my)) + 16*!!b->ref;
2214     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2215
2216     if(s->keyframe){
2217         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, 0, BLOCK_INTRA);
2218         return;
2219     }
2220
2221     if(level!=s->block_max_depth){
2222         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2223             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2224         }else{
2225             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2226             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2227             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2228             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2229             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2230             return;
2231         }
2232     }
2233     if(b->type & BLOCK_INTRA){
2234         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2235         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2236         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2237         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2238         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2239     }else{
2240         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2241         if(s->ref_frames > 1)
2242             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2243         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2244         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2245         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2246     }
2247 }
2248
2249 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2250     const int w= s->b_width << s->block_max_depth;
2251     const int rem_depth= s->block_max_depth - level;
2252     const int index= (x + y*w) << rem_depth;
2253     int trx= (x+1)<<rem_depth;
2254     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2255     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2256     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2257     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2258     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2259
2260     if(s->keyframe){
2261         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2262         return;
2263     }
2264
2265     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2266         int type;
2267         int l = left->color[0];
2268         int cb= left->color[1];
2269         int cr= left->color[2];
2270         int mx= mid_pred(left->mx, top->mx, tr->mx);
2271         int my= mid_pred(left->my, top->my, tr->my);
2272         int ref = 0;
2273         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2274         int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
2275         int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
2276
2277         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2278
2279         if(type){
2280             l += get_symbol(&s->c, &s->block_state[32], 1);
2281             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2282             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2283         }else{
2284             if(s->ref_frames > 1)
2285                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2286             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2287             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2288         }
2289         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2290     }else{
2291         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2292         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2293         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2294         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2295     }
2296 }
2297
2298 static void encode_blocks(SnowContext *s){
2299     int x, y;
2300     int w= s->b_width;
2301     int h= s->b_height;
2302
2303     if(s->avctx->me_method == ME_ITER && !s->keyframe)
2304         iterative_me(s);
2305
2306     for(y=0; y<h; y++){
2307         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2308             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2309             return;
2310         }
2311         for(x=0; x<w; x++){
2312             if(s->avctx->me_method == ME_ITER)
2313                 encode_q_branch2(s, 0, x, y);
2314             else
2315                 encode_q_branch (s, 0, x, y);
2316         }
2317     }
2318 }
2319
2320 static void decode_blocks(SnowContext *s){
2321     int x, y;
2322     int w= s->b_width;
2323     int h= s->b_height;
2324
2325     for(y=0; y<h; y++){
2326         for(x=0; x<w; x++){
2327             decode_q_branch(s, 0, x, y);
2328         }
2329     }
2330 }
2331
2332 static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2333     int x, y;
2334 START_TIMER
2335     for(y=0; y < b_h+5; y++){
2336         for(x=0; x < b_w; x++){
2337             int a0= src[x    ];
2338             int a1= src[x + 1];
2339             int a2= src[x + 2];
2340             int a3= src[x + 3];
2341             int a4= src[x + 4];
2342             int a5= src[x + 5];
2343 //            int am= 9*(a1+a2) - (a0+a3);
2344             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2345 //            int am= 18*(a2+a3) - 2*(a1+a4);
2346 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2347 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2348
2349 //            if(b_w==16) am= 8*(a1+a2);
2350
2351             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2352             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2353
2354             /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2355             if(am&(~255)) am= ~(am>>31);
2356
2357             tmp[x] = am;
2358
2359 /*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2360             else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2361             else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2362             else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2363         }
2364         tmp += stride;
2365         src += stride;
2366     }
2367     tmp -= (b_h+5)*stride;
2368
2369     for(y=0; y < b_h; y++){
2370         for(x=0; x < b_w; x++){
2371             int a0= tmp[x + 0*stride];
2372             int a1= tmp[x + 1*stride];
2373             int a2= tmp[x + 2*stride];
2374             int a3= tmp[x + 3*stride];
2375             int a4= tmp[x + 4*stride];
2376             int a5= tmp[x + 5*stride];
2377             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2378 //            int am= 18*(a2+a3) - 2*(a1+a4);
2379 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2380             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2381
2382 //            if(b_w==16) am= 8*(a1+a2);
2383
2384             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2385             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2386
2387             if(am&(~255)) am= ~(am>>31);
2388
2389             dst[x] = am;
2390 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2391             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2392             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2393             else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2394         }
2395         dst += stride;
2396         tmp += stride;
2397     }
2398 STOP_TIMER("mc_block")
2399 }
2400
2401 #define mca(dx,dy,b_w)\
2402 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2403     uint8_t tmp[stride*(b_w+5)];\
2404     assert(h==b_w);\
2405     mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2406 }
2407
2408 mca( 0, 0,16)
2409 mca( 8, 0,16)
2410 mca( 0, 8,16)
2411 mca( 8, 8,16)
2412 mca( 0, 0,8)
2413 mca( 8, 0,8)
2414 mca( 0, 8,8)
2415 mca( 8, 8,8)
2416
2417 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2418     if(block->type & BLOCK_INTRA){
2419         int x, y;
2420         const int color = block->color[plane_index];
2421         const int color4= color*0x01010101;
2422         if(b_w==32){
2423             for(y=0; y < b_h; y++){
2424                 *(uint32_t*)&dst[0 + y*stride]= color4;
2425                 *(uint32_t*)&dst[4 + y*stride]= color4;
2426                 *(uint32_t*)&dst[8 + y*stride]= color4;
2427                 *(uint32_t*)&dst[12+ y*stride]= color4;
2428                 *(uint32_t*)&dst[16+ y*stride]= color4;
2429                 *(uint32_t*)&dst[20+ y*stride]= color4;
2430                 *(uint32_t*)&dst[24+ y*stride]= color4;
2431                 *(uint32_t*)&dst[28+ y*stride]= color4;
2432             }
2433         }else if(b_w==16){
2434             for(y=0; y < b_h; y++){
2435                 *(uint32_t*)&dst[0 + y*stride]= color4;
2436                 *(uint32_t*)&dst[4 + y*stride]= color4;
2437                 *(uint32_t*)&dst[8 + y*stride]= color4;
2438                 *(uint32_t*)&dst[12+ y*stride]= color4;
2439             }
2440         }else if(b_w==8){
2441             for(y=0; y < b_h; y++){
2442                 *(uint32_t*)&dst[0 + y*stride]= color4;
2443                 *(uint32_t*)&dst[4 + y*stride]= color4;
2444             }
2445         }else if(b_w==4){
2446             for(y=0; y < b_h; y++){
2447                 *(uint32_t*)&dst[0 + y*stride]= color4;
2448             }
2449         }else{
2450             for(y=0; y < b_h; y++){
2451                 for(x=0; x < b_w; x++){
2452                     dst[x + y*stride]= color;
2453                 }
2454             }
2455         }
2456     }else{
2457         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2458         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2459         int mx= block->mx*scale;
2460         int my= block->my*scale;
2461         const int dx= mx&15;
2462         const int dy= my&15;
2463         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2464         sx += (mx>>4) - 2;
2465         sy += (my>>4) - 2;
2466         src += sx + sy*stride;
2467         if(   (unsigned)sx >= w - b_w - 4
2468            || (unsigned)sy >= h - b_h - 4){
2469             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2470             src= tmp + MB_SIZE;
2471         }
2472 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2473 //        assert(!(b_w&(b_w-1)));
2474         assert(b_w>1 && b_h>1);
2475         assert(tab_index>=0 && tab_index<4 || b_w==32);
2476         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2477             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2478         else if(b_w==32){
2479             int y;
2480             for(y=0; y<b_h; y+=16){
2481                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2482                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2483             }
2484         }else if(b_w==b_h)
2485             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2486         else if(b_w==2*b_h){
2487             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2488             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2489         }else{
2490             assert(2*b_w==b_h);
2491             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2492             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2493         }
2494     }
2495 }
2496
2497 void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2498                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2499     int y, x;
2500     DWTELEM * dst;
2501     for(y=0; y<b_h; y++){
2502         //FIXME ugly missue of obmc_stride
2503         uint8_t *obmc1= obmc + y*obmc_stride;
2504         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2505         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2506         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2507         dst = slice_buffer_get_line(sb, src_y + y);
2508         for(x=0; x<b_w; x++){
2509             int v=   obmc1[x] * block[3][x + y*src_stride]
2510                     +obmc2[x] * block[2][x + y*src_stride]
2511                     +obmc3[x] * block[1][x + y*src_stride]
2512                     +obmc4[x] * block[0][x + y*src_stride];
2513
2514             v <<= 8 - LOG2_OBMC_MAX;
2515             if(FRAC_BITS != 8){
2516                 v += 1<<(7 - FRAC_BITS);
2517                 v >>= 8 - FRAC_BITS;
2518             }
2519             if(add){
2520                 v += dst[x + src_x];
2521                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2522                 if(v&(~255)) v= ~(v>>31);
2523                 dst8[x + y*src_stride] = v;
2524             }else{
2525                 dst[x + src_x] -= v;
2526             }
2527         }
2528     }
2529 }
2530
2531 //FIXME name clenup (b_w, block_w, b_width stuff)
2532 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
2533     DWTELEM * dst = NULL;
2534     const int b_width = s->b_width  << s->block_max_depth;
2535     const int b_height= s->b_height << s->block_max_depth;
2536     const int b_stride= b_width;
2537     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2538     BlockNode *rt= lt+1;
2539     BlockNode *lb= lt+b_stride;
2540     BlockNode *rb= lb+1;
2541     uint8_t *block[4];
2542     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2543     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2544     uint8_t *ptmp;
2545     int x,y;
2546
2547     if(b_x<0){
2548         lt= rt;
2549         lb= rb;
2550     }else if(b_x + 1 >= b_width){
2551         rt= lt;
2552         rb= lb;
2553     }
2554     if(b_y<0){
2555         lt= lb;
2556         rt= rb;
2557     }else if(b_y + 1 >= b_height){
2558         lb= lt;
2559         rb= rt;
2560     }
2561
2562     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2563         obmc -= src_x;
2564         b_w += src_x;
2565         src_x=0;
2566     }else if(src_x + b_w > w){
2567         b_w = w - src_x;
2568     }
2569     if(src_y<0){
2570         obmc -= src_y*obmc_stride;
2571         b_h += src_y;
2572         src_y=0;
2573     }else if(src_y + b_h> h){
2574         b_h = h - src_y;
2575     }
2576
2577     if(b_w<=0 || b_h<=0) return;
2578
2579 assert(src_stride > 2*MB_SIZE + 5);
2580 //    old_dst += src_x + src_y*dst_stride;
2581     dst8+= src_x + src_y*src_stride;
2582 //    src += src_x + src_y*src_stride;
2583
2584     ptmp= tmp + 3*tmp_step;
2585     block[0]= ptmp;
2586     ptmp+=tmp_step;
2587     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2588
2589     if(same_block(lt, rt)){
2590         block[1]= block[0];
2591     }else{
2592         block[1]= ptmp;
2593         ptmp+=tmp_step;
2594         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2595     }
2596
2597     if(same_block(lt, lb)){
2598         block[2]= block[0];
2599     }else if(same_block(rt, lb)){
2600         block[2]= block[1];
2601     }else{
2602         block[2]= ptmp;
2603         ptmp+=tmp_step;
2604         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2605     }
2606
2607     if(same_block(lt, rb) ){
2608         block[3]= block[0];
2609     }else if(same_block(rt, rb)){
2610         block[3]= block[1];
2611     }else if(same_block(lb, rb)){
2612         block[3]= block[2];
2613     }else{
2614         block[3]= ptmp;
2615         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2616     }
2617 #if 0
2618     for(y=0; y<b_h; y++){
2619         for(x=0; x<b_w; x++){
2620             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2621             if(add) dst[x + y*dst_stride] += v;
2622             else    dst[x + y*dst_stride] -= v;
2623         }
2624     }
2625     for(y=0; y<b_h; y++){
2626         uint8_t *obmc2= obmc + (obmc_stride>>1);
2627         for(x=0; x<b_w; x++){
2628             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2629             if(add) dst[x + y*dst_stride] += v;
2630             else    dst[x + y*dst_stride] -= v;
2631         }
2632     }
2633     for(y=0; y<b_h; y++){
2634         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2635         for(x=0; x<b_w; x++){
2636             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2637             if(add) dst[x + y*dst_stride] += v;
2638             else    dst[x + y*dst_stride] -= v;
2639         }
2640     }
2641     for(y=0; y<b_h; y++){
2642         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2643         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2644         for(x=0; x<b_w; x++){
2645             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2646             if(add) dst[x + y*dst_stride] += v;
2647             else    dst[x + y*dst_stride] -= v;
2648         }
2649     }
2650 #else
2651 {
2652
2653     START_TIMER
2654
2655     s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2656         STOP_TIMER("Inner add y block")
2657 }
2658 #endif
2659 }
2660
2661 //FIXME name clenup (b_w, block_w, b_width stuff)
2662 static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2663     const int b_width = s->b_width  << s->block_max_depth;
2664     const int b_height= s->b_height << s->block_max_depth;
2665     const int b_stride= b_width;
2666     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2667     BlockNode *rt= lt+1;
2668     BlockNode *lb= lt+b_stride;
2669     BlockNode *rb= lb+1;
2670     uint8_t *block[4];
2671     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2672     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2673     uint8_t *ptmp;
2674     int x,y;
2675
2676     if(b_x<0){
2677         lt= rt;
2678         lb= rb;
2679     }else if(b_x + 1 >= b_width){
2680         rt= lt;
2681         rb= lb;
2682     }
2683     if(b_y<0){
2684         lt= lb;
2685         rt= rb;
2686     }else if(b_y + 1 >= b_height){
2687         lb= lt;
2688         rb= rt;
2689     }
2690
2691     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2692         obmc -= src_x;
2693         b_w += src_x;
2694         if(!offset_dst)
2695             dst -= src_x;
2696         src_x=0;
2697     }else if(src_x + b_w > w){
2698         b_w = w - src_x;
2699     }
2700     if(src_y<0){
2701         obmc -= src_y*obmc_stride;
2702         b_h += src_y;
2703         if(!offset_dst)
2704             dst -= src_y*dst_stride;
2705         src_y=0;
2706     }else if(src_y + b_h> h){
2707         b_h = h - src_y;
2708     }
2709
2710     if(b_w<=0 || b_h<=0) return;
2711
2712 assert(src_stride > 2*MB_SIZE + 5);
2713     if(offset_dst)
2714         dst += src_x + src_y*dst_stride;
2715     dst8+= src_x + src_y*src_stride;
2716 //    src += src_x + src_y*src_stride;
2717
2718     ptmp= tmp + 3*tmp_step;
2719     block[0]= ptmp;
2720     ptmp+=tmp_step;
2721     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2722
2723     if(same_block(lt, rt)){
2724         block[1]= block[0];
2725     }else{
2726         block[1]= ptmp;
2727         ptmp+=tmp_step;
2728         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2729     }
2730
2731     if(same_block(lt, lb)){
2732         block[2]= block[0];
2733     }else if(same_block(rt, lb)){
2734         block[2]= block[1];
2735     }else{
2736         block[2]= ptmp;
2737         ptmp+=tmp_step;
2738         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2739     }
2740
2741     if(same_block(lt, rb) ){
2742         block[3]= block[0];
2743     }else if(same_block(rt, rb)){
2744         block[3]= block[1];
2745     }else if(same_block(lb, rb)){
2746         block[3]= block[2];
2747     }else{
2748         block[3]= ptmp;
2749         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2750     }
2751 #if 0
2752     for(y=0; y<b_h; y++){
2753         for(x=0; x<b_w; x++){
2754             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2755             if(add) dst[x + y*dst_stride] += v;
2756             else    dst[x + y*dst_stride] -= v;
2757         }
2758     }
2759     for(y=0; y<b_h; y++){
2760         uint8_t *obmc2= obmc + (obmc_stride>>1);
2761         for(x=0; x<b_w; x++){
2762             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2763             if(add) dst[x + y*dst_stride] += v;
2764             else    dst[x + y*dst_stride] -= v;
2765         }
2766     }
2767     for(y=0; y<b_h; y++){
2768         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2769         for(x=0; x<b_w; x++){
2770             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2771             if(add) dst[x + y*dst_stride] += v;
2772             else    dst[x + y*dst_stride] -= v;
2773         }
2774     }
2775     for(y=0; y<b_h; y++){
2776         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2777         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2778         for(x=0; x<b_w; x++){
2779             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2780             if(add) dst[x + y*dst_stride] += v;
2781             else    dst[x + y*dst_stride] -= v;
2782         }
2783     }
2784 #else
2785     for(y=0; y<b_h; y++){
2786         //FIXME ugly missue of obmc_stride
2787         uint8_t *obmc1= obmc + y*obmc_stride;
2788         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2789         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2790         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2791         for(x=0; x<b_w; x++){
2792             int v=   obmc1[x] * block[3][x + y*src_stride]
2793                     +obmc2[x] * block[2][x + y*src_stride]
2794                     +obmc3[x] * block[1][x + y*src_stride]
2795                     +obmc4[x] * block[0][x + y*src_stride];
2796
2797             v <<= 8 - LOG2_OBMC_MAX;
2798             if(FRAC_BITS != 8){
2799                 v += 1<<(7 - FRAC_BITS);
2800                 v >>= 8 - FRAC_BITS;
2801             }
2802             if(add){
2803                 v += dst[x + y*dst_stride];
2804                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2805                 if(v&(~255)) v= ~(v>>31);
2806                 dst8[x + y*src_stride] = v;
2807             }else{
2808                 dst[x + y*dst_stride] -= v;
2809             }
2810         }
2811     }
2812 #endif
2813 }
2814
2815 static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2816     Plane *p= &s->plane[plane_index];
2817     const int mb_w= s->b_width  << s->block_max_depth;
2818     const int mb_h= s->b_height << s->block_max_depth;
2819     int x, y, mb_x;
2820     int block_size = MB_SIZE >> s->block_max_depth;
2821     int block_w    = plane_index ? block_size/2 : block_size;
2822     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2823     int obmc_stride= plane_index ? block_size : 2*block_size;
2824     int ref_stride= s->current_picture.linesize[plane_index];
2825     uint8_t *dst8= s->current_picture.data[plane_index];
2826     int w= p->width;
2827     int h= p->height;
2828     START_TIMER
2829
2830     if(s->keyframe || (s->avctx->debug&512)){
2831         if(mb_y==mb_h)
2832             return;
2833
2834         if(add){
2835             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2836             {
2837 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2838                 DWTELEM * line = sb->line[y];
2839                 for(x=0; x<w; x++)
2840                 {
2841 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2842                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2843                     v >>= FRAC_BITS;
2844                     if(v&(~255)) v= ~(v>>31);
2845                     dst8[x + y*ref_stride]= v;
2846                 }
2847             }
2848         }else{
2849             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2850             {
2851 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2852                 DWTELEM * line = sb->line[y];
2853                 for(x=0; x<w; x++)
2854                 {
2855                     line[x] -= 128 << FRAC_BITS;
2856 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2857                 }
2858             }
2859         }
2860
2861         return;
2862     }
2863
2864         for(mb_x=0; mb_x<=mb_w; mb_x++){
2865             START_TIMER
2866
2867             add_yblock_buffered(s, sb, old_buffer, dst8, obmc,
2868                        block_w*mb_x - block_w/2,
2869                        block_w*mb_y - block_w/2,
2870                        block_w, block_w,
2871                        w, h,
2872                        w, ref_stride, obmc_stride,
2873                        mb_x - 1, mb_y - 1,
2874                        add, plane_index);
2875
2876             STOP_TIMER("add_yblock")
2877         }
2878
2879     STOP_TIMER("predict_slice")
2880 }
2881
2882 static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2883     Plane *p= &s->plane[plane_index];
2884     const int mb_w= s->b_width  << s->block_max_depth;
2885     const int mb_h= s->b_height << s->block_max_depth;
2886     int x, y, mb_x;
2887     int block_size = MB_SIZE >> s->block_max_depth;
2888     int block_w    = plane_index ? block_size/2 : block_size;
2889     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2890     const int obmc_stride= plane_index ? block_size : 2*block_size;
2891     int ref_stride= s->current_picture.linesize[plane_index];
2892     uint8_t *dst8= s->current_picture.data[plane_index];
2893     int w= p->width;
2894     int h= p->height;
2895     START_TIMER
2896
2897     if(s->keyframe || (s->avctx->debug&512)){
2898         if(mb_y==mb_h)
2899             return;
2900
2901         if(add){
2902             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2903                 for(x=0; x<w; x++){
2904                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2905                     v >>= FRAC_BITS;
2906                     if(v&(~255)) v= ~(v>>31);
2907                     dst8[x + y*ref_stride]= v;
2908                 }
2909             }
2910         }else{
2911             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2912                 for(x=0; x<w; x++){
2913                     buf[x + y*w]-= 128<<FRAC_BITS;
2914                 }
2915             }
2916         }
2917
2918         return;
2919     }
2920
2921         for(mb_x=0; mb_x<=mb_w; mb_x++){
2922             START_TIMER
2923
2924             add_yblock(s, buf, dst8, obmc,
2925                        block_w*mb_x - block_w/2,
2926                        block_w*mb_y - block_w/2,
2927                        block_w, block_w,
2928                        w, h,
2929                        w, ref_stride, obmc_stride,
2930                        mb_x - 1, mb_y - 1,
2931                        add, 1, plane_index);
2932
2933             STOP_TIMER("add_yblock")
2934         }
2935
2936     STOP_TIMER("predict_slice")
2937 }
2938
2939 static always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2940     const int mb_h= s->b_height << s->block_max_depth;
2941     int mb_y;
2942     for(mb_y=0; mb_y<=mb_h; mb_y++)
2943         predict_slice(s, buf, plane_index, add, mb_y);
2944 }
2945
2946 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2947     int i, x2, y2;
2948     Plane *p= &s->plane[plane_index];
2949     const int block_size = MB_SIZE >> s->block_max_depth;
2950     const int block_w    = plane_index ? block_size/2 : block_size;
2951     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2952     const int obmc_stride= plane_index ? block_size : 2*block_size;
2953     const int ref_stride= s->current_picture.linesize[plane_index];
2954     uint8_t *src= s-> input_picture.data[plane_index];
2955     DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2956     const int b_stride = s->b_width << s->block_max_depth;
2957     const int w= p->width;
2958     const int h= p->height;
2959     int index= mb_x + mb_y*b_stride;
2960     BlockNode *b= &s->block[index];
2961     BlockNode backup= *b;
2962     int ab=0;
2963     int aa=0;
2964
2965     b->type|= BLOCK_INTRA;
2966     b->color[plane_index]= 0;
2967     memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2968
2969     for(i=0; i<4; i++){
2970         int mb_x2= mb_x + (i &1) - 1;
2971         int mb_y2= mb_y + (i>>1) - 1;
2972         int x= block_w*mb_x2 + block_w/2;
2973         int y= block_w*mb_y2 + block_w/2;
2974
2975         add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2976                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2977
2978         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2979             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2980                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2981                 int obmc_v= obmc[index];
2982                 int d;
2983                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2984                 if(x<0) obmc_v += obmc[index + block_w];
2985                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2986                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2987                 //FIXME precalc this or simplify it somehow else
2988
2989                 d = -dst[index] + (1<<(FRAC_BITS-1));
2990                 dst[index] = d;
2991                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2992                 aa += obmc_v * obmc_v; //FIXME precalclate this
2993             }
2994         }
2995     }
2996     *b= backup;
2997
2998     return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2999 }
3000
3001 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
3002     const int b_stride = s->b_width << s->block_max_depth;
3003     const int b_height = s->b_height<< s->block_max_depth;
3004     int index= x + y*b_stride;
3005     BlockNode *b     = &s->block[index];
3006     BlockNode *left  = x ? &s->block[index-1] : &null_block;
3007     BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
3008     BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
3009     BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
3010     int dmx, dmy;
3011 //  int mx_context= av_log2(2*ABS(left->mx - top->mx));
3012 //  int my_context= av_log2(2*ABS(left->my - top->my));
3013
3014     if(x<0 || x>=b_stride || y>=b_height)
3015         return 0;
3016     dmx= b->mx - mid_pred(left->mx, top->mx, tr->mx);
3017     dmy= b->my - mid_pred(left->my, top->my, tr->my);
3018 /*
3019 1            0      0
3020 01X          1-2    1
3021 001XX        3-6    2-3
3022 0001XXX      7-14   4-7
3023 00001XXXX   15-30   8-15
3024 */
3025 //FIXME try accurate rate
3026 //FIXME intra and inter predictors if surrounding blocks arent the same type
3027     if(b->type & BLOCK_INTRA){
3028         return 3+2*( av_log2(2*ABS(left->color[0] - b->color[0]))
3029                    + av_log2(2*ABS(left->color[1] - b->color[1]))
3030                    + av_log2(2*ABS(left->color[2] - b->color[2])));
3031     }else
3032         return 2*(1 + av_log2(2*ABS(dmx)) //FIXME kill the 2* can be merged in lambda
3033                     + av_log2(2*ABS(dmy))
3034                     + av_log2(2*b->ref));
3035 }
3036
3037 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
3038     Plane *p= &s->plane[plane_index];
3039     const int block_size = MB_SIZE >> s->block_max_depth;
3040     const int block_w    = plane_index ? block_size/2 : block_size;
3041     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3042     const int obmc_stride= plane_index ? block_size : 2*block_size;
3043     const int ref_stride= s->current_picture.linesize[plane_index];
3044     uint8_t *dst= s->current_picture.data[plane_index];
3045     uint8_t *src= s->  input_picture.data[plane_index];
3046     DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3047     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
3048     uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
3049     const int b_stride = s->b_width << s->block_max_depth;
3050     const int b_height = s->b_height<< s->block_max_depth;
3051     const int w= p->width;
3052     const int h= p->height;
3053     int distortion;
3054     int rate= 0;
3055     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3056     int sx= block_w*mb_x - block_w/2;
3057     int sy= block_w*mb_y - block_w/2;
3058     int x0= FFMAX(0,-sx);
3059     int y0= FFMAX(0,-sy);
3060     int x1= FFMIN(block_w*2, w-sx);
3061     int y1= FFMIN(block_w*2, h-sy);
3062     int i,x,y;
3063
3064     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
3065
3066     for(y=y0; y<y1; y++){
3067         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
3068         const DWTELEM *pred1 = pred + y*obmc_stride;
3069         uint8_t *cur1 = cur + y*ref_stride;
3070         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
3071         for(x=x0; x<x1; x++){
3072             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
3073             v = (v + pred1[x]) >> FRAC_BITS;
3074             if(v&(~255)) v= ~(v>>31);
3075             dst1[x] = v;
3076         }
3077     }
3078
3079     /* copy the regions where obmc[] = (uint8_t)256 */
3080     if(LOG2_OBMC_MAX == 8
3081         && (mb_x == 0 || mb_x == b_stride-1)
3082         && (mb_y == 0 || mb_y == b_height-1)){
3083         if(mb_x == 0)
3084             x1 = block_w;
3085         else
3086             x0 = block_w;
3087         if(mb_y == 0)
3088             y1 = block_w;
3089         else
3090             y0 = block_w;
3091         for(y=y0; y<y1; y++)
3092             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
3093     }
3094
3095     if(block_w==16){
3096         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
3097         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
3098         /* FIXME cmps overlap but don't cover the wavelet's whole support,
3099          * so improving the score of one block is not strictly guaranteed to
3100          * improve the score of the whole frame, so iterative motion est
3101          * doesn't always converge. */
3102         if(s->avctx->me_cmp == FF_CMP_W97)
3103             distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3104         else if(s->avctx->me_cmp == FF_CMP_W53)
3105             distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
3106         else{
3107             distortion = 0;
3108             for(i=0; i<4; i++){
3109                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3110                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3111             }
3112         }
3113     }else{
3114         assert(block_w==8);
3115         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3116     }
3117
3118     if(plane_index==0){
3119         for(i=0; i<4; i++){
3120 /* ..RRr
3121  * .RXx.
3122  * rxx..
3123  */
3124             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3125         }
3126         if(mb_x == b_stride-2)
3127             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3128     }
3129     return distortion + rate*penalty_factor;
3130 }
3131
3132 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3133     int i, y2;
3134     Plane *p= &s->plane[plane_index];
3135     const int block_size = MB_SIZE >> s->block_max_depth;
3136     const int block_w    = plane_index ? block_size/2 : block_size;
3137     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3138     const int obmc_stride= plane_index ? block_size : 2*block_size;
3139     const int ref_stride= s->current_picture.linesize[plane_index];
3140     uint8_t *dst= s->current_picture.data[plane_index];
3141     uint8_t *src= s-> input_picture.data[plane_index];
3142     const static DWTELEM zero_dst[4096]; //FIXME
3143     const int b_stride = s->b_width << s->block_max_depth;
3144     const int b_height = s->b_height<< s->block_max_depth;
3145     const int w= p->width;
3146     const int h= p->height;
3147     int distortion= 0;
3148     int rate= 0;
3149     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3150
3151     for(i=0; i<9; i++){
3152         int mb_x2= mb_x + (i%3) - 1;
3153         int mb_y2= mb_y + (i/3) - 1;
3154         int x= block_w*mb_x2 + block_w/2;
3155         int y= block_w*mb_y2 + block_w/2;
3156
3157         add_yblock(s, zero_dst, dst, obmc,
3158                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3159
3160         //FIXME find a cleaner/simpler way to skip the outside stuff
3161         for(y2= y; y2<0; y2++)
3162             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3163         for(y2= h; y2<y+block_w; y2++)
3164             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3165         if(x<0){
3166             for(y2= y; y2<y+block_w; y2++)
3167                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3168         }
3169         if(x+block_w > w){
3170             for(y2= y; y2<y+block_w; y2++)
3171                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3172         }
3173
3174         assert(block_w== 8 || block_w==16);
3175         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3176     }
3177
3178     if(plane_index==0){
3179         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3180         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3181
3182 /* ..RRRr
3183  * .RXXx.
3184  * .RXXx.
3185  * rxxx.
3186  */
3187         if(merged)
3188             rate = get_block_bits(s, mb_x, mb_y, 2);
3189         for(i=merged?4:0; i<9; i++){
3190             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3191             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3192         }
3193     }
3194     return distortion + rate*penalty_factor;
3195 }
3196
3197 static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3198     const int b_stride= s->b_width << s->block_max_depth;
3199     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3200     BlockNode backup= *block;
3201     int rd, index, value;
3202
3203     assert(mb_x>=0 && mb_y>=0);
3204     assert(mb_x<b_stride);
3205
3206     if(intra){
3207         block->color[0] = p[0];
3208         block->color[1] = p[1];
3209         block->color[2] = p[2];
3210         block->type |= BLOCK_INTRA;
3211     }else{
3212         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3213         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3214         if(s->me_cache[index] == value)
3215             return 0;
3216         s->me_cache[index]= value;
3217
3218         block->mx= p[0];
3219         block->my= p[1];
3220         block->type &= ~BLOCK_INTRA;
3221     }
3222
3223     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3224
3225 //FIXME chroma
3226     if(rd < *best_rd){
3227         *best_rd= rd;
3228         return 1;
3229     }else{
3230         *block= backup;
3231         return 0;
3232     }
3233 }
3234
3235 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3236 static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3237     int p[2] = {p0, p1};
3238     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3239 }
3240
3241 static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3242     const int b_stride= s->b_width << s->block_max_depth;
3243     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3244     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3245     int rd, index, value;
3246
3247     assert(mb_x>=0 && mb_y>=0);
3248     assert(mb_x<b_stride);
3249     assert(((mb_x|mb_y)&1) == 0);
3250
3251     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3252     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3253     if(s->me_cache[index] == value)
3254         return 0;
3255     s->me_cache[index]= value;
3256
3257     block->mx= p0;
3258     block->my= p1;
3259     block->ref= ref;
3260     block->type &= ~BLOCK_INTRA;
3261     block[1]= block[b_stride]= block[b_stride+1]= *block;
3262
3263     rd= get_4block_rd(s, mb_x, mb_y, 0);
3264
3265 //FIXME chroma
3266     if(rd < *best_rd){
3267         *best_rd= rd;
3268         return 1;
3269     }else{
3270         block[0]= backup[0];
3271         block[1]= backup[1];
3272         block[b_stride]= backup[2];
3273         block[b_stride+1]= backup[3];
3274         return 0;
3275     }
3276 }
3277
3278 static void iterative_me(SnowContext *s){
3279     int pass, mb_x, mb_y;
3280     const int b_width = s->b_width  << s->block_max_depth;
3281     const int b_height= s->b_height << s->block_max_depth;
3282     const int b_stride= b_width;
3283     int color[3];
3284
3285     {
3286         RangeCoder r = s->c;
3287         uint8_t state[sizeof(s->block_state)];
3288         memcpy(state, s->block_state, sizeof(s->block_state));
3289         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3290             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3291                 encode_q_branch(s, 0, mb_x, mb_y);
3292         s->c = r;
3293         memcpy(s->block_state, state, sizeof(s->block_state));
3294     }
3295
3296     for(pass=0; pass<25; pass++){
3297         int change= 0;
3298
3299         for(mb_y= 0; mb_y<b_height; mb_y++){
3300             for(mb_x= 0; mb_x<b_width; mb_x++){
3301                 int dia_change, i, j, ref;
3302                 int best_rd= INT_MAX, ref_rd;
3303                 BlockNode backup, ref_b;
3304                 const int index= mb_x + mb_y * b_stride;
3305                 BlockNode *block= &s->block[index];
3306                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3307                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3308                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3309                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3310                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3311                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3312                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3313                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3314                 const int b_w= (MB_SIZE >> s->block_max_depth);
3315                 uint8_t obmc_edged[b_w*2][b_w*2];
3316
3317                 if(pass && (block->type & BLOCK_OPT))
3318                     continue;
3319                 block->type |= BLOCK_OPT;
3320
3321                 backup= *block;
3322
3323                 if(!s->me_cache_generation)
3324                     memset(s->me_cache, 0, sizeof(s->me_cache));
3325                 s->me_cache_generation += 1<<22;
3326
3327                 //FIXME precalc
3328                 {
3329                     int x, y;
3330                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3331                     if(mb_x==0)
3332                         for(y=0; y<b_w*2; y++)
3333                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3334                     if(mb_x==b_stride-1)
3335                         for(y=0; y<b_w*2; y++)
3336                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3337                     if(mb_y==0){
3338                         for(x=0; x<b_w*2; x++)
3339                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3340                         for(y=1; y<b_w; y++)
3341                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3342                     }
3343                     if(mb_y==b_height-1){
3344                         for(x=0; x<b_w*2; x++)
3345                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3346                         for(y=b_w; y<b_w*2-1; y++)
3347                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3348                     }
3349                 }
3350
3351                 //skip stuff outside the picture
3352                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3353                 {
3354                     uint8_t *src= s->  input_picture.data[0];
3355                     uint8_t *dst= s->current_picture.data[0];
3356                     const int stride= s->current_picture.linesize[0];
3357                     const int block_w= MB_SIZE >> s->block_max_depth;
3358                     const int sx= block_w*mb_x - block_w/2;
3359                     const int sy= block_w*mb_y - block_w/2;
3360                     const int w= s->plane[0].width;
3361                     const int h= s->plane[0].height;
3362                     int y;
3363
3364                     for(y=sy; y<0; y++)
3365                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3366                     for(y=h; y<sy+block_w*2; y++)
3367                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3368                     if(sx<0){
3369                         for(y=sy; y<sy+block_w*2; y++)
3370                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3371                     }
3372                     if(sx+block_w*2 > w){
3373                         for(y=sy; y<sy+block_w*2; y++)
3374                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3375                     }
3376                 }
3377
3378                 // intra(black) = neighbors' contribution to the current block
3379                 for(i=0; i<3; i++)
3380                     color[i]= get_dc(s, mb_x, mb_y, i);
3381
3382                 // get previous score (cant be cached due to OBMC)
3383                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3384                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3385                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3386                 }else
3387                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3388
3389                 ref_b= *block;
3390                 ref_rd= best_rd;
3391                 for(ref=0; ref < s->ref_frames; ref++){
3392                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3393                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3394                         continue;
3395                     block->ref= ref;
3396                     best_rd= INT_MAX;
3397
3398                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3399                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3400                     if(tb)
3401                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3402                     if(lb)
3403                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3404                     if(rb)
3405                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3406                     if(bb)
3407                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3408
3409                     /* fullpel ME */
3410                     //FIXME avoid subpel interpol / round to nearest integer
3411                     do{
3412                         dia_change=0;
3413                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3414                             for(j=0; j<i; j++){
3415                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3416                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3417                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3418                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3419                             }
3420                         }
3421                     }while(dia_change);
3422                     /* subpel ME */
3423                     do{
3424                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3425                         dia_change=0;
3426                         for(i=0; i<8; i++)
3427                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3428                     }while(dia_change);
3429                     //FIXME or try the standard 2 pass qpel or similar
3430
3431                     mvr[0][0]= block->mx;
3432                     mvr[0][1]= block->my;
3433                     if(ref_rd > best_rd){
3434                         ref_rd= best_rd;
3435                         ref_b= *block;
3436                     }
3437                 }
3438                 best_rd= ref_rd;
3439                 *block= ref_b;
3440 #if 1
3441                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3442                 //FIXME RD style color selection
3443 #endif
3444                 if(!same_block(block, &backup)){
3445                     if(tb ) tb ->type &= ~BLOCK_OPT;
3446                     if(lb ) lb ->type &= ~BLOCK_OPT;
3447                     if(rb ) rb ->type &= ~BLOCK_OPT;
3448                     if(bb ) bb ->type &= ~BLOCK_OPT;
3449                     if(tlb) tlb->type &= ~BLOCK_OPT;
3450                     if(trb) trb->type &= ~BLOCK_OPT;
3451                     if(blb) blb->type &= ~BLOCK_OPT;
3452                     if(brb) brb->type &= ~BLOCK_OPT;
3453                     change ++;
3454                 }
3455             }
3456         }
3457         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3458         if(!change)
3459             break;
3460     }
3461
3462     if(s->block_max_depth == 1){
3463         int change= 0;
3464         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3465             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3466                 int i;
3467                 int best_rd, init_rd;
3468                 const int index= mb_x + mb_y * b_stride;
3469                 BlockNode *b[4];
3470
3471                 b[0]= &s->block[index];
3472                 b[1]= b[0]+1;
3473                 b[2]= b[0]+b_stride;
3474                 b[3]= b[2]+1;
3475                 if(same_block(b[0], b[1]) &&
3476                    same_block(b[0], b[2]) &&
3477                    same_block(b[0], b[3]))
3478                     continue;
3479
3480                 if(!s->me_cache_generation)
3481                     memset(s->me_cache, 0, sizeof(s->me_cache));
3482                 s->me_cache_generation += 1<<22;
3483
3484                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3485
3486                 //FIXME more multiref search?
3487                 check_4block_inter(s, mb_x, mb_y,
3488                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3489                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3490
3491                 for(i=0; i<4; i++)
3492                     if(!(b[i]->type&BLOCK_INTRA))
3493                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3494
3495                 if(init_rd != best_rd)
3496                     change++;
3497             }
3498         }
3499         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3500     }
3501 }
3502
3503 static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3504     const int level= b->level;
3505     const int w= b->width;
3506     const int h= b->height;
3507     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3508     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3509     int x,y, thres1, thres2;
3510 //    START_TIMER
3511
3512     if(s->qlog == LOSSLESS_QLOG) return;
3513
3514     bias= bias ? 0 : (3*qmul)>>3;
3515     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3516     thres2= 2*thres1;
3517
3518     if(!bias){
3519         for(y=0; y<h; y++){
3520             for(x=0; x<w; x++){
3521                 int i= src[x + y*stride];
3522
3523                 if((unsigned)(i+thres1) > thres2){
3524                     if(i>=0){
3525                         i<<= QEXPSHIFT;
3526                         i/= qmul; //FIXME optimize
3527                         src[x + y*stride]=  i;
3528                     }else{
3529                         i= -i;
3530                         i<<= QEXPSHIFT;
3531                         i/= qmul; //FIXME optimize
3532                         src[x + y*stride]= -i;
3533                     }
3534                 }else
3535                     src[x + y*stride]= 0;
3536             }
3537         }
3538     }else{
3539         for(y=0; y<h; y++){
3540             for(x=0; x<w; x++){
3541                 int i= src[x + y*stride];
3542
3543                 if((unsigned)(i+thres1) > thres2){
3544                     if(i>=0){
3545                         i<<= QEXPSHIFT;
3546                         i= (i + bias) / qmul; //FIXME optimize
3547                         src[x + y*stride]=  i;
3548                     }else{
3549                         i= -i;
3550                         i<<= QEXPSHIFT;
3551                         i= (i + bias) / qmul; //FIXME optimize
3552                         src[x + y*stride]= -i;
3553                     }
3554                 }else
3555                     src[x + y*stride]= 0;
3556             }
3557         }
3558     }
3559     if(level+1 == s->spatial_decomposition_count){
3560 //        STOP_TIMER("quantize")
3561     }
3562 }
3563
3564 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3565     const int w= b->width;
3566     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3567     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3568     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3569     int x,y;
3570     START_TIMER
3571
3572     if(s->qlog == LOSSLESS_QLOG) return;
3573
3574     for(y=start_y; y<end_y; y++){
3575 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3576         DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3577         for(x=0; x<w; x++){
3578             int i= line[x];
3579             if(i<0){
3580                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3581             }else if(i>0){
3582                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3583             }
3584         }
3585     }
3586     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3587         STOP_TIMER("dquant")
3588     }
3589 }
3590
3591 static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3592     const int w= b->width;
3593     const int h= b->height;
3594     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3595     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3596     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3597     int x,y;
3598     START_TIMER
3599
3600     if(s->qlog == LOSSLESS_QLOG) return;
3601
3602     for(y=0; y<h; y++){
3603         for(x=0; x<w; x++){
3604             int i= src[x + y*stride];
3605             if(i<0){
3606                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3607             }else if(i>0){
3608                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3609             }
3610         }
3611     }
3612     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3613         STOP_TIMER("dquant")
3614     }
3615 }
3616
3617 static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3618     const int w= b->width;
3619     const int h= b->height;
3620     int x,y;
3621
3622     for(y=h-1; y>=0; y--){
3623         for(x=w-1; x>=0; x--){
3624             int i= x + y*stride;
3625
3626             if(x){
3627                 if(use_median){
3628                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3629                     else  src[i] -= src[i - 1];
3630                 }else{
3631                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3632                     else  src[i] -= src[i - 1];
3633                 }
3634             }else{
3635                 if(y) src[i] -= src[i - stride];
3636             }
3637         }
3638     }
3639 }
3640
3641 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3642     const int w= b->width;
3643     int x,y;
3644
3645 //    START_TIMER
3646
3647     DWTELEM * line;
3648     DWTELEM * prev;
3649
3650     if (start_y != 0)
3651         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3652
3653     for(y=start_y; y<end_y; y++){
3654         prev = line;
3655 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3656         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3657         for(x=0; x<w; x++){
3658             if(x){
3659                 if(use_median){
3660                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3661                     else  line[x] += line[x - 1];
3662                 }else{
3663                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3664                     else  line[x] += line[x - 1];
3665                 }
3666             }else{
3667                 if(y) line[x] += prev[x];
3668             }
3669         }
3670     }
3671
3672 //    STOP_TIMER("correlate")
3673 }
3674
3675 static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3676     const int w= b->width;
3677     const int h= b->height;
3678     int x,y;
3679
3680     for(y=0; y<h; y++){
3681         for(x=0; x<w; x++){
3682             int i= x + y*stride;
3683
3684             if(x){
3685                 if(use_median){
3686                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3687                     else  src[i] += src[i - 1];
3688                 }else{
3689                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3690                     else  src[i] += src[i - 1];
3691                 }
3692             }else{
3693                 if(y) src[i] += src[i - stride];
3694             }
3695         }
3696     }
3697 }
3698
3699 static void encode_header(SnowContext *s){
3700     int plane_index, level, orientation;
3701     uint8_t kstate[32];
3702
3703     memset(kstate, MID_STATE, sizeof(kstate));
3704
3705     put_rac(&s->c, kstate, s->keyframe);
3706     if(s->keyframe || s->always_reset)
3707         reset_contexts(s);
3708     if(s->keyframe){
3709         put_symbol(&s->c, s->header_state, s->version, 0);
3710         put_rac(&s->c, s->header_state, s->always_reset);
3711         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3712         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3713         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3714         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3715         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3716         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3717         put_rac(&s->c, s->header_state, s->spatial_scalability);
3718 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3719         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3720
3721         for(plane_index=0; plane_index<2; plane_index++){
3722             for(level=0; level<s->spatial_decomposition_count; level++){
3723                 for(orientation=level ? 1:0; orientation<4; orientation++){
3724                     if(orientation==2) continue;
3725                     put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3726                 }
3727             }
3728         }
3729     }
3730     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0);
3731     put_symbol(&s->c, s->header_state, s->qlog, 1);
3732     put_symbol(&s->c, s->header_state, s->mv_scale, 0);
3733     put_symbol(&s->c, s->header_state, s->qbias, 1);
3734     put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
3735 }
3736
3737 static int decode_header(SnowContext *s){
3738     int plane_index, level, orientation;
3739     uint8_t kstate[32];
3740
3741     memset(kstate, MID_STATE, sizeof(kstate));
3742
3743     s->keyframe= get_rac(&s->c, kstate);
3744     if(s->keyframe || s->always_reset)
3745         reset_contexts(s);
3746     if(s->keyframe){
3747         s->version= get_symbol(&s->c, s->header_state, 0);
3748         if(s->version>0){
3749             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3750             return -1;
3751         }
3752         s->always_reset= get_rac(&s->c, s->header_state);
3753         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3754         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3755         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3756         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3757         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3758         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3759         s->spatial_scalability= get_rac(&s->c, s->header_state);
3760 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3761         s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3762
3763         for(plane_index=0; plane_index<3; plane_index++){
3764             for(level=0; level<s->spatial_decomposition_count; level++){
3765                 for(orientation=level ? 1:0; orientation<4; orientation++){
3766                     int q;
3767                     if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3768                     else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3769                     else                    q= get_symbol(&s->c, s->header_state, 1);
3770                     s->plane[plane_index].band[level][orientation].qlog= q;
3771                 }
3772             }
3773         }
3774     }
3775
3776     s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3777     if(s->spatial_decomposition_type > 2){
3778         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3779         return -1;
3780     }
3781
3782     s->qlog= get_symbol(&s->c, s->header_state, 1);
3783     s->mv_scale= get_symbol(&s->c, s->header_state, 0);
3784     s->qbias= get_symbol(&s->c, s->header_state, 1);
3785     s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
3786     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3787         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3788         s->block_max_depth= 0;
3789         return -1;
3790     }
3791
3792     return 0;
3793 }
3794
3795 static void init_qexp(void){
3796     int i;
3797     double v=128;
3798
3799     for(i=0; i<QROOT; i++){
3800         qexp[i]= lrintf(v);
3801         v *= pow(2, 1.0 / QROOT);
3802     }
3803 }
3804
3805 static int common_init(AVCodecContext *avctx){
3806     SnowContext *s = avctx->priv_data;
3807     int width, height;
3808     int level, orientation, plane_index, dec;
3809
3810     s->avctx= avctx;
3811
3812     dsputil_init(&s->dsp, avctx);
3813
3814 #define mcf(dx,dy)\
3815     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3816     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3817         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3818     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3819     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3820         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3821
3822     mcf( 0, 0)
3823     mcf( 4, 0)
3824     mcf( 8, 0)
3825     mcf(12, 0)
3826     mcf( 0, 4)
3827     mcf( 4, 4)
3828     mcf( 8, 4)
3829     mcf(12, 4)
3830     mcf( 0, 8)
3831     mcf( 4, 8)
3832     mcf( 8, 8)
3833     mcf(12, 8)
3834     mcf( 0,12)
3835     mcf( 4,12)
3836     mcf( 8,12)
3837     mcf(12,12)
3838
3839 #define mcfh(dx,dy)\
3840     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3841     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3842         mc_block_hpel ## dx ## dy ## 16;\
3843     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3844     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3845         mc_block_hpel ## dx ## dy ## 8;
3846
3847     mcfh(0, 0)
3848     mcfh(8, 0)
3849     mcfh(0, 8)
3850     mcfh(8, 8)
3851
3852     if(!qexp[0])
3853         init_qexp();
3854
3855     dec= s->spatial_decomposition_count= 5;
3856     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3857
3858     s->chroma_h_shift= 1; //FIXME XXX
3859     s->chroma_v_shift= 1;
3860
3861 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3862
3863     width= s->avctx->width;
3864     height= s->avctx->height;
3865
3866     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3867
3868     s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3869     s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3870
3871     for(plane_index=0; plane_index<3; plane_index++){
3872         int w= s->avctx->width;
3873         int h= s->avctx->height;
3874
3875         if(plane_index){
3876             w>>= s->chroma_h_shift;
3877             h>>= s->chroma_v_shift;
3878         }
3879         s->plane[plane_index].width = w;
3880         s->plane[plane_index].height= h;
3881 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3882         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3883             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3884                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3885
3886                 b->buf= s->spatial_dwt_buffer;
3887                 b->level= level;
3888                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3889                 b->width = (w + !(orientation&1))>>1;
3890                 b->height= (h + !(orientation>1))>>1;
3891
3892                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3893                 b->buf_x_offset = 0;
3894                 b->buf_y_offset = 0;
3895
3896                 if(orientation&1){
3897                     b->buf += (w+1)>>1;
3898                     b->buf_x_offset = (w+1)>>1;
3899                 }
3900                 if(orientation>1){
3901                     b->buf += b->stride>>1;
3902                     b->buf_y_offset = b->stride_line >> 1;
3903                 }
3904
3905                 if(level)
3906                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3907                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3908             }
3909             w= (w+1)>>1;
3910             h= (h+1)>>1;
3911         }
3912     }
3913
3914     reset_contexts(s);
3915 /*
3916     width= s->width= avctx->width;
3917     height= s->height= avctx->height;
3918
3919     assert(width && height);
3920 */
3921     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3922
3923     return 0;
3924 }
3925
3926 static int qscale2qlog(int qscale){
3927     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3928            + 61*QROOT/8; //<64 >60
3929 }
3930
3931 static void ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3932 {
3933     /* estimate the frame's complexity as a sum of weighted dwt coefs.
3934      * FIXME we know exact mv bits at this point,
3935      * but ratecontrol isn't set up to include them. */
3936     uint32_t coef_sum= 0;
3937     int level, orientation;
3938
3939     for(level=0; level<s->spatial_decomposition_count; level++){
3940         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3941             SubBand *b= &s->plane[0].band[level][orientation];
3942             DWTELEM *buf= b->buf;
3943             const int w= b->width;
3944             const int h= b->height;
3945             const int stride= b->stride;
3946             const int qlog= clip(2*QROOT + b->qlog, 0, QROOT*16);
3947             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3948             const int qdiv= (1<<16)/qmul;
3949             int x, y;
3950             if(orientation==0)
3951                 decorrelate(s, b, buf, stride, 1, 0);
3952             for(y=0; y<h; y++)
3953                 for(x=0; x<w; x++)
3954                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3955             if(orientation==0)
3956                 correlate(s, b, buf, stride, 1, 0);
3957         }
3958     }
3959
3960     /* ugly, ratecontrol just takes a sqrt again */
3961     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3962     assert(coef_sum < INT_MAX);
3963
3964     if(pict->pict_type == I_TYPE){
3965         s->m.current_picture.mb_var_sum= coef_sum;
3966         s->m.current_picture.mc_mb_var_sum= 0;
3967     }else{
3968         s->m.current_picture.mc_mb_var_sum= coef_sum;
3969         s->m.current_picture.mb_var_sum= 0;
3970     }
3971
3972     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3973     s->lambda= pict->quality * 3/2;
3974     s->qlog= qscale2qlog(pict->quality);
3975 }
3976
3977 static void calculate_vissual_weight(SnowContext *s, Plane *p){
3978     int width = p->width;
3979     int height= p->height;
3980     int level, orientation, x, y;
3981
3982     for(level=0; level<s->spatial_decomposition_count; level++){
3983         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3984             SubBand *b= &p->band[level][orientation];
3985             DWTELEM *buf= b->buf;
3986             int64_t error=0;
3987
3988             memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3989             buf[b->width/2 + b->height/2*b->stride]= 256*256;
3990             ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3991             for(y=0; y<height; y++){
3992                 for(x=0; x<width; x++){
3993                     int64_t d= s->spatial_dwt_buffer[x + y*width];
3994                     error += d*d;
3995                 }
3996             }
3997
3998             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3999 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
4000         }
4001     }
4002 }
4003
4004 static int encode_init(AVCodecContext *avctx)
4005 {
4006     SnowContext *s = avctx->priv_data;
4007     int plane_index;
4008
4009     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
4010         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
4011                "use vstrict=-2 / -strict -2 to use it anyway\n");
4012         return -1;
4013     }
4014
4015     common_init(avctx);
4016     alloc_blocks(s);
4017
4018     s->version=0;
4019
4020     s->m.avctx   = avctx;
4021     s->m.flags   = avctx->flags;
4022     s->m.bit_rate= avctx->bit_rate;
4023
4024     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
4025     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4026     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4027     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
4028     h263_encode_init(&s->m); //mv_penalty
4029
4030     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
4031
4032     if(avctx->flags&CODEC_FLAG_PASS1){
4033         if(!avctx->stats_out)
4034             avctx->stats_out = av_mallocz(256);
4035     }
4036     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
4037         if(ff_rate_control_init(&s->m) < 0)
4038             return -1;
4039     }
4040     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
4041
4042     for(plane_index=0; plane_index<3; plane_index++){
4043         calculate_vissual_weight(s, &s->plane[plane_index]);
4044     }
4045
4046
4047     avctx->coded_frame= &s->current_picture;
4048     switch(avctx->pix_fmt){
4049 //    case PIX_FMT_YUV444P:
4050 //    case PIX_FMT_YUV422P:
4051     case PIX_FMT_YUV420P:
4052     case PIX_FMT_GRAY8:
4053 //    case PIX_FMT_YUV411P:
4054 //    case PIX_FMT_YUV410P:
4055         s->colorspace_type= 0;
4056         break;
4057 /*    case PIX_FMT_RGBA32:
4058         s->colorspace= 1;
4059         break;*/
4060     default:
4061         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
4062         return -1;
4063     }
4064 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4065     s->chroma_h_shift= 1;
4066     s->chroma_v_shift= 1;
4067
4068     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4069     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4070
4071     s->avctx->get_buffer(s->avctx, &s->input_picture);
4072
4073     if(s->avctx->me_method == ME_ITER){
4074         int i;
4075         int size= s->b_width * s->b_height << 2*s->block_max_depth;
4076         for(i=0; i<s->max_ref_frames; i++){
4077             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4078             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4079         }
4080     }
4081
4082     return 0;
4083 }
4084
4085 static int frame_start(SnowContext *s){
4086    AVFrame tmp;
4087    int w= s->avctx->width; //FIXME round up to x16 ?
4088    int h= s->avctx->height;
4089
4090     if(s->current_picture.data[0]){
4091         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4092         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4093         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4094     }
4095
4096     tmp= s->last_picture[s->max_ref_frames-1];
4097     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4098     s->last_picture[0]= s->current_picture;
4099     s->current_picture= tmp;
4100
4101     if(s->keyframe){
4102         s->ref_frames= 0;
4103     }else{
4104         int i;
4105         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4106             if(i && s->last_picture[i-1].key_frame)
4107                 break;
4108         s->ref_frames= i;
4109     }
4110
4111     s->current_picture.reference= 1;
4112     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4113         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4114         return -1;
4115     }
4116
4117     s->current_picture.key_frame= s->keyframe;
4118
4119     return 0;
4120 }
4121
4122 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4123     SnowContext *s = avctx->priv_data;
4124     RangeCoder * const c= &s->c;
4125     AVFrame *pict = data;
4126     const int width= s->avctx->width;
4127     const int height= s->avctx->height;
4128     int level, orientation, plane_index, i, y;
4129
4130     ff_init_range_encoder(c, buf, buf_size);
4131     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4132
4133     for(i=0; i<3; i++){
4134         int shift= !!i;
4135         for(y=0; y<(height>>shift); y++)
4136             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4137                    &pict->data[i][y * pict->linesize[i]],
4138                    width>>shift);
4139     }
4140     s->new_picture = *pict;
4141
4142     s->m.picture_number= avctx->frame_number;
4143     if(avctx->flags&CODEC_FLAG_PASS2){
4144         s->m.pict_type =
4145         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4146         s->keyframe= pict->pict_type==FF_I_TYPE;
4147         if(!(avctx->flags&CODEC_FLAG_QSCALE))
4148             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4149     }else{
4150         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4151         s->m.pict_type=
4152         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4153     }
4154
4155     if(s->pass1_rc && avctx->frame_number == 0)
4156         pict->quality= 2*FF_QP2LAMBDA;
4157     if(pict->quality){
4158         s->qlog= qscale2qlog(pict->quality);
4159         s->lambda = pict->quality * 3/2;
4160     }
4161     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
4162         s->qlog= LOSSLESS_QLOG;
4163         s->lambda = 0;
4164     }//else keep previous frame's qlog until after motion est
4165
4166     frame_start(s);
4167
4168     s->m.current_picture_ptr= &s->m.current_picture;
4169     if(pict->pict_type == P_TYPE){
4170         int block_width = (width +15)>>4;
4171         int block_height= (height+15)>>4;
4172         int stride= s->current_picture.linesize[0];
4173
4174         assert(s->current_picture.data[0]);
4175         assert(s->last_picture[0].data[0]);
4176
4177         s->m.avctx= s->avctx;
4178         s->m.current_picture.data[0]= s->current_picture.data[0];
4179         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
4180         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4181         s->m.   last_picture_ptr= &s->m.   last_picture;
4182         s->m.linesize=
4183         s->m.   last_picture.linesize[0]=
4184         s->m.    new_picture.linesize[0]=
4185         s->m.current_picture.linesize[0]= stride;
4186         s->m.uvlinesize= s->current_picture.linesize[1];
4187         s->m.width = width;
4188         s->m.height= height;
4189         s->m.mb_width = block_width;
4190         s->m.mb_height= block_height;
4191         s->m.mb_stride=   s->m.mb_width+1;
4192         s->m.b8_stride= 2*s->m.mb_width+1;
4193         s->m.f_code=1;
4194         s->m.pict_type= pict->pict_type;
4195         s->m.me_method= s->avctx->me_method;
4196         s->m.me.scene_change_score=0;
4197         s->m.flags= s->avctx->flags;
4198         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4199         s->m.out_format= FMT_H263;
4200         s->m.unrestricted_mv= 1;
4201
4202         s->m.lambda = s->lambda;
4203         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4204         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4205
4206         s->m.dsp= s->dsp; //move
4207         ff_init_me(&s->m);
4208         s->dsp= s->m.dsp;
4209     }
4210
4211 redo_frame:
4212
4213     s->m.pict_type = pict->pict_type;
4214     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
4215
4216     encode_header(s);
4217     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4218     encode_blocks(s);
4219     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4220
4221     for(plane_index=0; plane_index<3; plane_index++){
4222         Plane *p= &s->plane[plane_index];
4223         int w= p->width;
4224         int h= p->height;
4225         int x, y;
4226 //        int bits= put_bits_count(&s->c.pb);
4227
4228         //FIXME optimize
4229      if(pict->data[plane_index]) //FIXME gray hack
4230         for(y=0; y<h; y++){
4231             for(x=0; x<w; x++){
4232                 s->spatial_dwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4233             }
4234         }
4235         predict_plane(s, s->spatial_dwt_buffer, plane_index, 0);
4236
4237         if(   plane_index==0
4238            && pict->pict_type == P_TYPE
4239            && !(avctx->flags&CODEC_FLAG_PASS2)
4240            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4241             ff_init_range_encoder(c, buf, buf_size);
4242             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4243             pict->pict_type= FF_I_TYPE;
4244             s->keyframe=1;
4245             s->current_picture.key_frame=1;
4246             reset_contexts(s);
4247             goto redo_frame;
4248         }
4249
4250         if(s->qlog == LOSSLESS_QLOG){
4251             for(y=0; y<h; y++){
4252                 for(x=0; x<w; x++){
4253                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_dwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4254                 }
4255             }
4256         }
4257
4258         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4259
4260         if(s->pass1_rc && plane_index==0)
4261             ratecontrol_1pass(s, pict);
4262
4263         for(level=0; level<s->spatial_decomposition_count; level++){
4264             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4265                 SubBand *b= &p->band[level][orientation];
4266
4267                 quantize(s, b, b->buf, b->stride, s->qbias);
4268                 if(orientation==0)
4269                     decorrelate(s, b, b->buf, b->stride, pict->pict_type == P_TYPE, 0);
4270                 encode_subband(s, b, b->buf, b->parent ? b->parent->buf : NULL, b->stride, orientation);
4271                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
4272                 if(orientation==0)
4273                     correlate(s, b, b->buf, b->stride, 1, 0);
4274             }
4275         }
4276 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
4277
4278         for(level=0; level<s->spatial_decomposition_count; level++){
4279             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4280                 SubBand *b= &p->band[level][orientation];
4281
4282                 dequantize(s, b, b->buf, b->stride);
4283             }
4284         }
4285
4286         ff_spatial_idwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4287         if(s->qlog == LOSSLESS_QLOG){
4288             for(y=0; y<h; y++){
4289                 for(x=0; x<w; x++){
4290                     s->spatial_dwt_buffer[y*w + x]<<=FRAC_BITS;
4291                 }
4292             }
4293         }
4294 {START_TIMER
4295         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4296 STOP_TIMER("pred-conv")}
4297         if(s->avctx->flags&CODEC_FLAG_PSNR){
4298             int64_t error= 0;
4299
4300     if(pict->data[plane_index]) //FIXME gray hack
4301             for(y=0; y<h; y++){
4302                 for(x=0; x<w; x++){
4303                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4304                     error += d*d;
4305                 }
4306             }
4307             s->avctx->error[plane_index] += error;
4308             s->current_picture.error[plane_index] = error;
4309         }
4310     }
4311
4312     if(s->last_picture[s->max_ref_frames-1].data[0])
4313         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4314
4315     s->current_picture.coded_picture_number = avctx->frame_number;
4316     s->current_picture.pict_type = pict->pict_type;
4317     s->current_picture.quality = pict->quality;
4318     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4319     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4320     s->m.current_picture.display_picture_number =
4321     s->m.current_picture.coded_picture_number = avctx->frame_number;
4322     s->m.current_picture.quality = pict->quality;
4323     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4324     if(s->pass1_rc)
4325         ff_rate_estimate_qscale(&s->m, 0);
4326     if(avctx->flags&CODEC_FLAG_PASS1)
4327         ff_write_pass1_stats(&s->m);
4328     s->m.last_pict_type = s->m.pict_type;
4329
4330     emms_c();
4331
4332     return ff_rac_terminate(c);
4333 }
4334
4335 static void common_end(SnowContext *s){
4336     int plane_index, level, orientation, i;
4337
4338     av_freep(&s->spatial_dwt_buffer);
4339
4340     av_freep(&s->m.me.scratchpad);
4341     av_freep(&s->m.me.map);
4342     av_freep(&s->m.me.score_map);
4343     av_freep(&s->m.obmc_scratchpad);
4344
4345     av_freep(&s->block);
4346
4347     for(i=0; i<MAX_REF_FRAMES; i++){
4348         av_freep(&s->ref_mvs[i]);
4349         av_freep(&s->ref_scores[i]);
4350         if(s->last_picture[i].data[0])
4351             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4352     }
4353
4354     for(plane_index=0; plane_index<3; plane_index++){
4355         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4356             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4357                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4358
4359                 av_freep(&b->x_coeff);
4360             }
4361         }
4362     }
4363 }
4364
4365 static int encode_end(AVCodecContext *avctx)
4366 {
4367     SnowContext *s = avctx->priv_data;
4368
4369     common_end(s);
4370     av_free(avctx->stats_out);
4371
4372     return 0;
4373 }
4374
4375 static int decode_init(AVCodecContext *avctx)
4376 {
4377     SnowContext *s = avctx->priv_data;
4378     int block_size;
4379
4380     avctx->pix_fmt= PIX_FMT_YUV420P;
4381
4382     common_init(avctx);
4383
4384     block_size = MB_SIZE >> s->block_max_depth;
4385     slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 3)) + 1, s->plane[0].width, s->spatial_dwt_buffer);
4386
4387     return 0;
4388 }
4389
4390 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4391     SnowContext *s = avctx->priv_data;
4392     RangeCoder * const c= &s->c;
4393     int bytes_read;
4394     AVFrame *picture = data;
4395     int level, orientation, plane_index;
4396
4397     ff_init_range_decoder(c, buf, buf_size);
4398     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4399
4400     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4401     decode_header(s);
4402     if(!s->block) alloc_blocks(s);
4403
4404     frame_start(s);
4405     //keyframe flag dupliaction mess FIXME
4406     if(avctx->debug&FF_DEBUG_PICT_INFO)
4407         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4408
4409     decode_blocks(s);
4410
4411     for(plane_index=0; plane_index<3; plane_index++){
4412         Plane *p= &s->plane[plane_index];
4413         int w= p->width;
4414         int h= p->height;
4415         int x, y;
4416         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4417
4418 if(s->avctx->debug&2048){
4419         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4420         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4421
4422         for(y=0; y<h; y++){
4423             for(x=0; x<w; x++){
4424                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4425                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4426             }
4427         }
4428 }
4429
4430 {   START_TIMER
4431     for(level=0; level<s->spatial_decomposition_count; level++){
4432         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4433             SubBand *b= &p->band[level][orientation];
4434             unpack_coeffs(s, b, b->parent, orientation);
4435         }
4436     }
4437     STOP_TIMER("unpack coeffs");
4438 }
4439
4440 {START_TIMER
4441     const int mb_h= s->b_height << s->block_max_depth;
4442     const int block_size = MB_SIZE >> s->block_max_depth;
4443     const int block_w    = plane_index ? block_size/2 : block_size;
4444     int mb_y;
4445     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4446     int yd=0, yq=0;
4447     int y;
4448     int end_y;
4449
4450     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4451     for(mb_y=0; mb_y<=mb_h; mb_y++){
4452
4453         int slice_starty = block_w*mb_y;
4454         int slice_h = block_w*(mb_y+1);
4455         if (!(s->keyframe || s->avctx->debug&512)){
4456             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4457             slice_h -= (block_w >> 1);
4458         }
4459
4460         {
4461         START_TIMER
4462         for(level=0; level<s->spatial_decomposition_count; level++){
4463             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4464                 SubBand *b= &p->band[level][orientation];
4465                 int start_y;
4466                 int end_y;
4467                 int our_mb_start = mb_y;
4468                 int our_mb_end = (mb_y + 1);
4469                 const int extra= 3;
4470                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4471                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4472                 if (!(s->keyframe || s->avctx->debug&512)){
4473                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4474                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4475                 }
4476                 start_y = FFMIN(b->height, start_y);
4477                 end_y = FFMIN(b->height, end_y);
4478
4479                 if (start_y != end_y){
4480                     if (orientation == 0){
4481                         SubBand * correlate_band = &p->band[0][0];
4482                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4483                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4484                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4485                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4486                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, start_y, end_y);
4487                     }
4488                     else
4489                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4490                 }
4491             }
4492         }
4493         STOP_TIMER("decode_subband_slice");
4494         }
4495
4496 {   START_TIMER
4497         for(; yd<slice_h; yd+=4){
4498             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4499         }
4500     STOP_TIMER("idwt slice");}
4501
4502
4503         if(s->qlog == LOSSLESS_QLOG){
4504             for(; yq<slice_h && yq<h; yq++){
4505                 DWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4506                 for(x=0; x<w; x++){
4507                     line[x] <<= FRAC_BITS;
4508                 }
4509             }
4510         }
4511
4512         predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
4513
4514         y = FFMIN(p->height, slice_starty);
4515         end_y = FFMIN(p->height, slice_h);
4516         while(y < end_y)
4517             slice_buffer_release(&s->sb, y++);
4518     }
4519
4520     slice_buffer_flush(&s->sb);
4521
4522 STOP_TIMER("idwt + predict_slices")}
4523     }
4524
4525     emms_c();
4526
4527     if(s->last_picture[s->max_ref_frames-1].data[0])
4528         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4529
4530 if(!(s->avctx->debug&2048))
4531     *picture= s->current_picture;
4532 else
4533     *picture= s->mconly_picture;
4534
4535     *data_size = sizeof(AVFrame);
4536
4537     bytes_read= c->bytestream - c->bytestream_start;
4538     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4539
4540     return bytes_read;
4541 }
4542
4543 static int decode_end(AVCodecContext *avctx)
4544 {
4545     SnowContext *s = avctx->priv_data;
4546
4547     slice_buffer_destroy(&s->sb);
4548
4549     common_end(s);
4550
4551     return 0;
4552 }
4553
4554 AVCodec snow_decoder = {
4555     "snow",
4556     CODEC_TYPE_VIDEO,
4557     CODEC_ID_SNOW,
4558     sizeof(SnowContext),
4559     decode_init,
4560     NULL,
4561     decode_end,
4562     decode_frame,
4563     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4564     NULL
4565 };
4566
4567 #ifdef CONFIG_ENCODERS
4568 AVCodec snow_encoder = {
4569     "snow",
4570     CODEC_TYPE_VIDEO,
4571     CODEC_ID_SNOW,
4572     sizeof(SnowContext),
4573     encode_init,
4574     encode_frame,
4575     encode_end,
4576 };
4577 #endif
4578
4579
4580 #if 0
4581 #undef malloc
4582 #undef free
4583 #undef printf
4584
4585 int main(){
4586     int width=256;
4587     int height=256;
4588     int buffer[2][width*height];
4589     SnowContext s;
4590     int i;
4591     s.spatial_decomposition_count=6;
4592     s.spatial_decomposition_type=1;
4593
4594     printf("testing 5/3 DWT\n");
4595     for(i=0; i<width*height; i++)
4596         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4597
4598     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4599     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4600
4601     for(i=0; i<width*height; i++)
4602         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4603
4604     printf("testing 9/7 DWT\n");
4605     s.spatial_decomposition_type=0;
4606     for(i=0; i<width*height; i++)
4607         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4608
4609     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4610     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4611
4612     for(i=0; i<width*height; i++)
4613         if(ABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4614
4615 #if 0
4616     printf("testing AC coder\n");
4617     memset(s.header_state, 0, sizeof(s.header_state));
4618     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4619     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4620
4621     for(i=-256; i<256; i++){
4622 START_TIMER
4623         put_symbol(&s.c, s.header_state, i*i*i/3*ABS(i), 1);
4624 STOP_TIMER("put_symbol")
4625     }
4626     ff_rac_terminate(&s.c);
4627
4628     memset(s.header_state, 0, sizeof(s.header_state));
4629     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4630     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4631
4632     for(i=-256; i<256; i++){
4633         int j;
4634 START_TIMER
4635         j= get_symbol(&s.c, s.header_state, 1);
4636 STOP_TIMER("get_symbol")
4637         if(j!=i*i*i/3*ABS(i)) printf("fsck: %d != %d\n", i, j);
4638     }
4639 #endif
4640 {
4641 int level, orientation, x, y;
4642 int64_t errors[8][4];
4643 int64_t g=0;
4644
4645     memset(errors, 0, sizeof(errors));
4646     s.spatial_decomposition_count=3;
4647     s.spatial_decomposition_type=0;
4648     for(level=0; level<s.spatial_decomposition_count; level++){
4649         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4650             int w= width  >> (s.spatial_decomposition_count-level);
4651             int h= height >> (s.spatial_decomposition_count-level);
4652             int stride= width  << (s.spatial_decomposition_count-level);
4653             DWTELEM *buf= buffer[0];
4654             int64_t error=0;
4655
4656             if(orientation&1) buf+=w;
4657             if(orientation>1) buf+=stride>>1;
4658
4659             memset(buffer[0], 0, sizeof(int)*width*height);
4660             buf[w/2 + h/2*stride]= 256*256;
4661             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4662             for(y=0; y<height; y++){
4663                 for(x=0; x<width; x++){
4664                     int64_t d= buffer[0][x + y*width];
4665                     error += d*d;
4666                     if(ABS(width/2-x)<9 && ABS(height/2-y)<9 && level==2) printf("%8lld ", d);
4667                 }
4668                 if(ABS(height/2-y)<9 && level==2) printf("\n");
4669             }
4670             error= (int)(sqrt(error)+0.5);
4671             errors[level][orientation]= error;
4672             if(g) g=ff_gcd(g, error);
4673             else g= error;
4674         }
4675     }
4676     printf("static int const visual_weight[][4]={\n");
4677     for(level=0; level<s.spatial_decomposition_count; level++){
4678         printf("  {");
4679         for(orientation=0; orientation<4; orientation++){
4680             printf("%8lld,", errors[level][orientation]/g);
4681         }
4682         printf("},\n");
4683     }
4684     printf("};\n");
4685     {
4686             int level=2;
4687             int orientation=3;
4688             int w= width  >> (s.spatial_decomposition_count-level);
4689             int h= height >> (s.spatial_decomposition_count-level);
4690             int stride= width  << (s.spatial_decomposition_count-level);
4691             DWTELEM *buf= buffer[0];
4692             int64_t error=0;
4693
4694             buf+=w;
4695             buf+=stride>>1;
4696
4697             memset(buffer[0], 0, sizeof(int)*width*height);
4698 #if 1
4699             for(y=0; y<height; y++){
4700                 for(x=0; x<width; x++){
4701                     int tab[4]={0,2,3,1};
4702                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4703                 }
4704             }
4705             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4706 #else
4707             for(y=0; y<h; y++){
4708                 for(x=0; x<w; x++){
4709                     buf[x + y*stride  ]=169;
4710                     buf[x + y*stride-w]=64;
4711                 }
4712             }
4713             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4714 #endif
4715             for(y=0; y<height; y++){
4716                 for(x=0; x<width; x++){
4717                     int64_t d= buffer[0][x + y*width];
4718                     error += d*d;
4719                     if(ABS(width/2-x)<9 && ABS(height/2-y)<9) printf("%8lld ", d);
4720                 }
4721                 if(ABS(height/2-y)<9) printf("\n");
4722             }
4723     }
4724
4725 }
4726     return 0;
4727 }
4728 #endif
4729