git.sesse.net Git - ffmpeg/blob - libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  */
  18
  19 #include "avcodec.h"
  20 #include "common.h"
  21 #include "dsputil.h"
  22 #include "snow.h"
  23
  24 #include "rangecoder.h"
  25
  26 #include "mpegvideo.h"
  27
  28 #undef NDEBUG
  29 #include <assert.h>
  30
  31 static const int8_t quant3[256]={
  32  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  33  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  48 };
  49 static const int8_t quant3b[256]={
  50  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  51  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 };
  67 static const int8_t quant3bA[256]={
  68  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  69  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84 };
  85 static const int8_t quant5[256]={
  86  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  87  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 102 };
 103 static const int8_t quant7[256]={
 104  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 105  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 107  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 118 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 120 };
 121 static const int8_t quant9[256]={
 122  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 123  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 124  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 137 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 138 };
 139 static const int8_t quant11[256]={
 140  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 141  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 142  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 143  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 154 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 155 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 156 };
 157 static const int8_t quant13[256]={
 158  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 159  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 160  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 162  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 171 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 174 };
 175
 176 #if 0 //64*cubic
 177 static const uint8_t obmc32[1024]={
 178  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 179  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 180  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 181  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 182  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 183  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 184  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 185  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 186  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 187  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 188  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 189  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 190  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 191  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 192  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 193  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 194  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 195  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 196  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 197  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 198  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 199  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 200  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 201  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 202  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 203  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 204  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 205  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 206  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 207  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 208  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 209  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 210 //error:0.000022
 211 };
 212 static const uint8_t obmc16[256]={
 213  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 214  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 215  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 216  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 217  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 218  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 219  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 220  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 221  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 222  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 223  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 224  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 225  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 226  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 227  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 228  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 229 //error:0.000033
 230 };
 231 #elif 1 // 64*linear
 232 static const uint8_t obmc32[1024]={
 233   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 234   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 235   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 236   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 237   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 238   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 239   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 240   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 241   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 242   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 243   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 244   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 245   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 246   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 247   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 248   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 251   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 252   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 253   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 254   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 255   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 256   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 257   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 258   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 259   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 260   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 261   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 262   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 263   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 264   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 265  //error:0.000020
 266 };
 267 static const uint8_t obmc16[256]={
 268   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 269   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 270   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 271   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 272   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 273  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 274  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 275  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 278  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 279   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 280   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 281   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 282   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 283   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 284 //error:0.000015
 285 };
 286 #else //64*cos
 287 static const uint8_t obmc32[1024]={
 288  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 289  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 290  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 291  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 292  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 293  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 294  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 295  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 296  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 297  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 298  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 299  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 300  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 301  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 302  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 303  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 304  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 305  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 306  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 307  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 308  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 309  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 310  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 311  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 312  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 313  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 314  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 315  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 316  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 317  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 318  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 319  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 320 //error:0.000022
 321 };
 322 static const uint8_t obmc16[256]={
 323  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 324  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 325  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 326  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 327  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 328  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 329  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 330  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 331  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 332  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 333  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 334  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 335  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 336  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 337  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 338  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 339 //error:0.000022
 340 };
 341 #endif
 342
 343 //linear *64
 344 static const uint8_t obmc8[64]={
 345   4, 12, 20, 28, 28, 20, 12,  4,
 346  12, 36, 60, 84, 84, 60, 36, 12,
 347  20, 60,100,140,140,100, 60, 20,
 348  28, 84,140,196,196,140, 84, 28,
 349  28, 84,140,196,196,140, 84, 28,
 350  20, 60,100,140,140,100, 60, 20,
 351  12, 36, 60, 84, 84, 60, 36, 12,
 352   4, 12, 20, 28, 28, 20, 12,  4,
 353 //error:0.000000
 354 };
 355
 356 //linear *64
 357 static const uint8_t obmc4[16]={
 358  16, 48, 48, 16,
 359  48,144,144, 48,
 360  48,144,144, 48,
 361  16, 48, 48, 16,
 362 //error:0.000000
 363 };
 364
 365 static const uint8_t *obmc_tab[4]={
 366     obmc32, obmc16, obmc8, obmc4
 367 };
 368
 369 typedef struct BlockNode{
 370     int16_t mx;
 371     int16_t my;
 372     uint8_t ref;
 373     uint8_t color[3];
 374     uint8_t type;
 375 //#define TYPE_SPLIT    1
 376 #define BLOCK_INTRA   1
 377 #define BLOCK_OPT     2
 378 //#define TYPE_NOCOLOR  4
 379     uint8_t level; //FIXME merge into type?
 380 }BlockNode;
 381
 382 static const BlockNode null_block= { //FIXME add border maybe
 383     .color= {128,128,128},
 384     .mx= 0,
 385     .my= 0,
 386     .ref= 0,
 387     .type= 0,
 388     .level= 0,
 389 };
 390
 391 #define LOG2_MB_SIZE 4
 392 #define MB_SIZE (1<<LOG2_MB_SIZE)
 393
 394 typedef struct x_and_coeff{
 395     int16_t x;
 396     uint16_t coeff;
 397 } x_and_coeff;
 398
 399 typedef struct SubBand{
 400     int level;
 401     int stride;
 402     int width;
 403     int height;
 404     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 405     DWTELEM *buf;
 406     int buf_x_offset;
 407     int buf_y_offset;
 408     int stride_line; ///< Stride measured in lines, not pixels.
 409     x_and_coeff * x_coeff;
 410     struct SubBand *parent;
 411     uint8_t state[/*7*2*/ 7 + 512][32];
 412 }SubBand;
 413
 414 typedef struct Plane{
 415     int width;
 416     int height;
 417     SubBand band[MAX_DECOMPOSITIONS][4];
 418 }Plane;
 419
 420 typedef struct SnowContext{
 421 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 422
 423     AVCodecContext *avctx;
 424     RangeCoder c;
 425     DSPContext dsp;
 426     AVFrame new_picture;
 427     AVFrame input_picture;              ///< new_picture with the internal linesizes
 428     AVFrame current_picture;
 429     AVFrame last_picture[MAX_REF_FRAMES];
 430     AVFrame mconly_picture;
 431 //     uint8_t q_context[16];
 432     uint8_t header_state[32];
 433     uint8_t block_state[128 + 32*128];
 434     int keyframe;
 435     int always_reset;
 436     int version;
 437     int spatial_decomposition_type;
 438     int temporal_decomposition_type;
 439     int spatial_decomposition_count;
 440     int temporal_decomposition_count;
 441     int max_ref_frames;
 442     int ref_frames;
 443     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 444     uint32_t *ref_scores[MAX_REF_FRAMES];
 445     DWTELEM *spatial_dwt_buffer;
 446     int colorspace_type;
 447     int chroma_h_shift;
 448     int chroma_v_shift;
 449     int spatial_scalability;
 450     int qlog;
 451     int lambda;
 452     int lambda2;
 453     int pass1_rc;
 454     int mv_scale;
 455     int qbias;
 456 #define QBIAS_SHIFT 3
 457     int b_width;
 458     int b_height;
 459     int block_max_depth;
 460     Plane plane[MAX_PLANES];
 461     BlockNode *block;
 462 #define ME_CACHE_SIZE 1024
 463     int me_cache[ME_CACHE_SIZE];
 464     int me_cache_generation;
 465     slice_buffer sb;
 466
 467     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independant of MpegEncContext, so this will be removed then (FIXME/XXX)
 468 }SnowContext;
 469
 470 typedef struct {
 471     DWTELEM *b0;
 472     DWTELEM *b1;
 473     DWTELEM *b2;
 474     DWTELEM *b3;
 475     int y;
 476 } dwt_compose_t;
 477
 478 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 479 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 480
 481 static void iterative_me(SnowContext *s);
 482
 483 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
 484 {
 485     int i;
 486
 487     buf->base_buffer = base_buffer;
 488     buf->line_count = line_count;
 489     buf->line_width = line_width;
 490     buf->data_count = max_allocated_lines;
 491     buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
 492     buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
 493
 494     for (i = 0; i < max_allocated_lines; i++)
 495     {
 496       buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
 497     }
 498
 499     buf->data_stack_top = max_allocated_lines - 1;
 500 }
 501
 502 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 503 {
 504     int offset;
 505     DWTELEM * buffer;
 506
 507 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 508
 509     assert(buf->data_stack_top >= 0);
 510 //  assert(!buf->line[line]);
 511     if (buf->line[line])
 512         return buf->line[line];
 513
 514     offset = buf->line_width * line;
 515     buffer = buf->data_stack[buf->data_stack_top];
 516     buf->data_stack_top--;
 517     buf->line[line] = buffer;
 518
 519 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 520
 521     return buffer;
 522 }
 523
 524 static void slice_buffer_release(slice_buffer * buf, int line)
 525 {
 526     int offset;
 527     DWTELEM * buffer;
 528
 529     assert(line >= 0 && line < buf->line_count);
 530     assert(buf->line[line]);
 531
 532     offset = buf->line_width * line;
 533     buffer = buf->line[line];
 534     buf->data_stack_top++;
 535     buf->data_stack[buf->data_stack_top] = buffer;
 536     buf->line[line] = NULL;
 537
 538 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 539 }
 540
 541 static void slice_buffer_flush(slice_buffer * buf)
 542 {
 543     int i;
 544     for (i = 0; i < buf->line_count; i++)
 545     {
 546         if (buf->line[i])
 547         {
 548 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 549             slice_buffer_release(buf, i);
 550         }
 551     }
 552 }
 553
 554 static void slice_buffer_destroy(slice_buffer * buf)
 555 {
 556     int i;
 557     slice_buffer_flush(buf);
 558
 559     for (i = buf->data_count - 1; i >= 0; i--)
 560     {
 561         assert(buf->data_stack[i]);
 562         av_freep(&buf->data_stack[i]);
 563     }
 564     assert(buf->data_stack);
 565     av_freep(&buf->data_stack);
 566     assert(buf->line);
 567     av_freep(&buf->line);
 568 }
 569
 570 #ifdef __sgi
 571 // Avoid a name clash on SGI IRIX
 572 #undef qexp
 573 #endif
 574 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 575 static uint8_t qexp[QROOT];
 576
 577 static inline int mirror(int v, int m){
 578     while((unsigned)v > (unsigned)m){
 579         v=-v;
 580         if(v<0) v+= 2*m;
 581     }
 582     return v;
 583 }
 584
 585 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 586     int i;
 587
 588     if(v){
 589         const int a= ABS(v);
 590         const int e= av_log2(a);
 591 #if 1
 592         const int el= FFMIN(e, 10);
 593         put_rac(c, state+0, 0);
 594
 595         for(i=0; i<el; i++){
 596             put_rac(c, state+1+i, 1);  //1..10
 597         }
 598         for(; i<e; i++){
 599             put_rac(c, state+1+9, 1);  //1..10
 600         }
 601         put_rac(c, state+1+FFMIN(i,9), 0);
 602
 603         for(i=e-1; i>=el; i--){
 604             put_rac(c, state+22+9, (a>>i)&1); //22..31
 605         }
 606         for(; i>=0; i--){
 607             put_rac(c, state+22+i, (a>>i)&1); //22..31
 608         }
 609
 610         if(is_signed)
 611             put_rac(c, state+11 + el, v < 0); //11..21
 612 #else
 613
 614         put_rac(c, state+0, 0);
 615         if(e<=9){
 616             for(i=0; i<e; i++){
 617                 put_rac(c, state+1+i, 1);  //1..10
 618             }
 619             put_rac(c, state+1+i, 0);
 620
 621             for(i=e-1; i>=0; i--){
 622                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 623             }
 624
 625             if(is_signed)
 626                 put_rac(c, state+11 + e, v < 0); //11..21
 627         }else{
 628             for(i=0; i<e; i++){
 629                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 630             }
 631             put_rac(c, state+1+FFMIN(i,9), 0);
 632
 633             for(i=e-1; i>=0; i--){
 634                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 635             }
 636
 637             if(is_signed)
 638                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 639         }
 640 #endif
 641     }else{
 642         put_rac(c, state+0, 1);
 643     }
 644 }
 645
 646 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 647     if(get_rac(c, state+0))
 648         return 0;
 649     else{
 650         int i, e, a;
 651         e= 0;
 652         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 653             e++;
 654         }
 655
 656         a= 1;
 657         for(i=e-1; i>=0; i--){
 658             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 659         }
 660
 661         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 662             return -a;
 663         else
 664             return a;
 665     }
 666 }
 667
 668 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 669     int i;
 670     int r= log2>=0 ? 1<<log2 : 1;
 671
 672     assert(v>=0);
 673     assert(log2>=-4);
 674
 675     while(v >= r){
 676         put_rac(c, state+4+log2, 1);
 677         v -= r;
 678         log2++;
 679         if(log2>0) r+=r;
 680     }
 681     put_rac(c, state+4+log2, 0);
 682
 683     for(i=log2-1; i>=0; i--){
 684         put_rac(c, state+31-i, (v>>i)&1);
 685     }
 686 }
 687
 688 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 689     int i;
 690     int r= log2>=0 ? 1<<log2 : 1;
 691     int v=0;
 692
 693     assert(log2>=-4);
 694
 695     while(get_rac(c, state+4+log2)){
 696         v+= r;
 697         log2++;
 698         if(log2>0) r+=r;
 699     }
 700
 701     for(i=log2-1; i>=0; i--){
 702         v+= get_rac(c, state+31-i)<<i;
 703     }
 704
 705     return v;
 706 }
 707
 708 static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 709     const int mirror_left= !highpass;
 710     const int mirror_right= (width&1) ^ highpass;
 711     const int w= (width>>1) - 1 + (highpass & width);
 712     int i;
 713
 714 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 715     if(mirror_left){
 716         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 717         dst += dst_step;
 718         src += src_step;
 719     }
 720
 721     for(i=0; i<w; i++){
 722         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 723     }
 724
 725     if(mirror_right){
 726         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 727     }
 728 }
 729
 730 #ifndef lift5
 731 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 732     const int mirror_left= !highpass;
 733     const int mirror_right= (width&1) ^ highpass;
 734     const int w= (width>>1) - 1 + (highpass & width);
 735     int i;
 736
 737     if(mirror_left){
 738         int r= 3*2*ref[0];
 739         r += r>>4;
 740         r += r>>8;
 741         dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
 742         dst += dst_step;
 743         src += src_step;
 744     }
 745
 746     for(i=0; i<w; i++){
 747         int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
 748         r += r>>4;
 749         r += r>>8;
 750         dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
 751     }
 752
 753     if(mirror_right){
 754         int r= 3*2*ref[w*ref_step];
 755         r += r>>4;
 756         r += r>>8;
 757         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
 758     }
 759 }
 760 #endif
 761
 762 #ifndef liftS
 763 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 764     const int mirror_left= !highpass;
 765     const int mirror_right= (width&1) ^ highpass;
 766     const int w= (width>>1) - 1 + (highpass & width);
 767     int i;
 768
 769     assert(shift == 4);
 770 #define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
 771     if(mirror_left){
 772         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 773         dst += dst_step;
 774         src += src_step;
 775     }
 776
 777     for(i=0; i<w; i++){
 778         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 779     }
 780
 781     if(mirror_right){
 782         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 783     }
 784 }
 785 #endif
 786
 787
 788 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
 789     int x, i;
 790
 791     for(x=start; x<width; x+=2){
 792         int64_t sum=0;
 793
 794         for(i=0; i<n; i++){
 795             int x2= x + 2*i - n + 1;
 796             if     (x2<     0) x2= -x2;
 797             else if(x2>=width) x2= 2*width-x2-2;
 798             sum += coeffs[i]*(int64_t)dst[x2];
 799         }
 800         if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
 801         else        dst[x] += (sum + (1<<shift)/2)>>shift;
 802     }
 803 }
 804
 805 static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
 806     int x, y, i;
 807     for(y=start; y<height; y+=2){
 808         for(x=0; x<width; x++){
 809             int64_t sum=0;
 810
 811             for(i=0; i<n; i++){
 812                 int y2= y + 2*i - n + 1;
 813                 if     (y2<      0) y2= -y2;
 814                 else if(y2>=height) y2= 2*height-y2-2;
 815                 sum += coeffs[i]*(int64_t)dst[x + y2*stride];
 816             }
 817             if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
 818             else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
 819         }
 820     }
 821 }
 822
 823 #define SCALEX 1
 824 #define LX0 0
 825 #define LX1 1
 826
 827 #if 0 // more accurate 9/7
 828 #define N1 2
 829 #define SHIFT1 14
 830 #define COEFFS1 (int[]){-25987,-25987}
 831 #define N2 2
 832 #define SHIFT2 19
 833 #define COEFFS2 (int[]){-27777,-27777}
 834 #define N3 2
 835 #define SHIFT3 15
 836 #define COEFFS3 (int[]){28931,28931}
 837 #define N4 2
 838 #define SHIFT4 15
 839 #define COEFFS4 (int[]){14533,14533}
 840 #elif 1 // 13/7 CRF
 841 #define N1 4
 842 #define SHIFT1 4
 843 #define COEFFS1 (int[]){1,-9,-9,1}
 844 #define N2 4
 845 #define SHIFT2 4
 846 #define COEFFS2 (int[]){-1,5,5,-1}
 847 #define N3 0
 848 #define SHIFT3 1
 849 #define COEFFS3 NULL
 850 #define N4 0
 851 #define SHIFT4 1
 852 #define COEFFS4 NULL
 853 #elif 1 // 3/5
 854 #define LX0 1
 855 #define LX1 0
 856 #define SCALEX 0.5
 857 #define N1 2
 858 #define SHIFT1 1
 859 #define COEFFS1 (int[]){1,1}
 860 #define N2 2
 861 #define SHIFT2 2
 862 #define COEFFS2 (int[]){-1,-1}
 863 #define N3 0
 864 #define SHIFT3 0
 865 #define COEFFS3 NULL
 866 #define N4 0
 867 #define SHIFT4 0
 868 #define COEFFS4 NULL
 869 #elif 1 // 11/5
 870 #define N1 0
 871 #define SHIFT1 1
 872 #define COEFFS1 NULL
 873 #define N2 2
 874 #define SHIFT2 2
 875 #define COEFFS2 (int[]){-1,-1}
 876 #define N3 2
 877 #define SHIFT3 0
 878 #define COEFFS3 (int[]){-1,-1}
 879 #define N4 4
 880 #define SHIFT4 7
 881 #define COEFFS4 (int[]){-5,29,29,-5}
 882 #define SCALEX 4
 883 #elif 1 // 9/7 CDF
 884 #define N1 2
 885 #define SHIFT1 7
 886 #define COEFFS1 (int[]){-203,-203}
 887 #define N2 2
 888 #define SHIFT2 12
 889 #define COEFFS2 (int[]){-217,-217}
 890 #define N3 2
 891 #define SHIFT3 7
 892 #define COEFFS3 (int[]){113,113}
 893 #define N4 2
 894 #define SHIFT4 9
 895 #define COEFFS4 (int[]){227,227}
 896 #define SCALEX 1
 897 #elif 1 // 7/5 CDF
 898 #define N1 0
 899 #define SHIFT1 1
 900 #define COEFFS1 NULL
 901 #define N2 2
 902 #define SHIFT2 2
 903 #define COEFFS2 (int[]){-1,-1}
 904 #define N3 2
 905 #define SHIFT3 0
 906 #define COEFFS3 (int[]){-1,-1}
 907 #define N4 2
 908 #define SHIFT4 4
 909 #define COEFFS4 (int[]){3,3}
 910 #elif 1 // 9/7 MN
 911 #define N1 4
 912 #define SHIFT1 4
 913 #define COEFFS1 (int[]){1,-9,-9,1}
 914 #define N2 2
 915 #define SHIFT2 2
 916 #define COEFFS2 (int[]){1,1}
 917 #define N3 0
 918 #define SHIFT3 1
 919 #define COEFFS3 NULL
 920 #define N4 0
 921 #define SHIFT4 1
 922 #define COEFFS4 NULL
 923 #else // 13/7 CRF
 924 #define N1 4
 925 #define SHIFT1 4
 926 #define COEFFS1 (int[]){1,-9,-9,1}
 927 #define N2 4
 928 #define SHIFT2 4
 929 #define COEFFS2 (int[]){-1,5,5,-1}
 930 #define N3 0
 931 #define SHIFT3 1
 932 #define COEFFS3 NULL
 933 #define N4 0
 934 #define SHIFT4 1
 935 #define COEFFS4 NULL
 936 #endif
 937 static void horizontal_decomposeX(DWTELEM *b, int width){
 938     DWTELEM temp[width];
 939     const int width2= width>>1;
 940     const int w2= (width+1)>>1;
 941     int x;
 942
 943     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
 944     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
 945     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
 946     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
 947
 948     for(x=0; x<width2; x++){
 949         temp[x   ]= b[2*x    ];
 950         temp[x+w2]= b[2*x + 1];
 951     }
 952     if(width&1)
 953         temp[x   ]= b[2*x    ];
 954     memcpy(b, temp, width*sizeof(int));
 955 }
 956
 957 static void horizontal_composeX(DWTELEM *b, int width){
 958     DWTELEM temp[width];
 959     const int width2= width>>1;
 960     int x;
 961     const int w2= (width+1)>>1;
 962
 963     memcpy(temp, b, width*sizeof(int));
 964     for(x=0; x<width2; x++){
 965         b[2*x    ]= temp[x   ];
 966         b[2*x + 1]= temp[x+w2];
 967     }
 968     if(width&1)
 969         b[2*x    ]= temp[x   ];
 970
 971     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
 972     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
 973     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
 974     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
 975 }
 976
 977 static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
 978     int x, y;
 979
 980     for(y=0; y<height; y++){
 981         for(x=0; x<width; x++){
 982             buffer[y*stride + x] *= SCALEX;
 983         }
 984     }
 985
 986     for(y=0; y<height; y++){
 987         horizontal_decomposeX(buffer + y*stride, width);
 988     }
 989
 990     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
 991     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
 992     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
 993     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
 994 }
 995
 996 static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
 997     int x, y;
 998
 999     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
1000     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
1001     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
1002     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
1003
1004     for(y=0; y<height; y++){
1005         horizontal_composeX(buffer + y*stride, width);
1006     }
1007
1008     for(y=0; y<height; y++){
1009         for(x=0; x<width; x++){
1010             buffer[y*stride + x] /= SCALEX;
1011         }
1012     }
1013 }
1014
1015 static void horizontal_decompose53i(DWTELEM *b, int width){
1016     DWTELEM temp[width];
1017     const int width2= width>>1;
1018     int x;
1019     const int w2= (width+1)>>1;
1020
1021     for(x=0; x<width2; x++){
1022         temp[x   ]= b[2*x    ];
1023         temp[x+w2]= b[2*x + 1];
1024     }
1025     if(width&1)
1026         temp[x   ]= b[2*x    ];
1027 #if 0
1028     {
1029     int A1,A2,A3,A4;
1030     A2= temp[1       ];
1031     A4= temp[0       ];
1032     A1= temp[0+width2];
1033     A1 -= (A2 + A4)>>1;
1034     A4 += (A1 + 1)>>1;
1035     b[0+width2] = A1;
1036     b[0       ] = A4;
1037     for(x=1; x+1<width2; x+=2){
1038         A3= temp[x+width2];
1039         A4= temp[x+1     ];
1040         A3 -= (A2 + A4)>>1;
1041         A2 += (A1 + A3 + 2)>>2;
1042         b[x+width2] = A3;
1043         b[x       ] = A2;
1044
1045         A1= temp[x+1+width2];
1046         A2= temp[x+2       ];
1047         A1 -= (A2 + A4)>>1;
1048         A4 += (A1 + A3 + 2)>>2;
1049         b[x+1+width2] = A1;
1050         b[x+1       ] = A4;
1051     }
1052     A3= temp[width-1];
1053     A3 -= A2;
1054     A2 += (A1 + A3 + 2)>>2;
1055     b[width -1] = A3;
1056     b[width2-1] = A2;
1057     }
1058 #else
1059     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1060     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1061 #endif
1062 }
1063
1064 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1065     int i;
1066
1067     for(i=0; i<width; i++){
1068         b1[i] -= (b0[i] + b2[i])>>1;
1069     }
1070 }
1071
1072 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1073     int i;
1074
1075     for(i=0; i<width; i++){
1076         b1[i] += (b0[i] + b2[i] + 2)>>2;
1077     }
1078 }
1079
1080 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1081     int y;
1082     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1083     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1084
1085     for(y=-2; y<height; y+=2){
1086         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1087         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1088
1089 {START_TIMER
1090         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1091         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1092 STOP_TIMER("horizontal_decompose53i")}
1093
1094 {START_TIMER
1095         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1096         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1097 STOP_TIMER("vertical_decompose53i*")}
1098
1099         b0=b2;
1100         b1=b3;
1101     }
1102 }
1103
1104 static void horizontal_decompose97i(DWTELEM *b, int width){
1105     DWTELEM temp[width];
1106     const int w2= (width+1)>>1;
1107
1108     lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1109     liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1110     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1111     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1112 }
1113
1114
1115 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1116     int i;
1117
1118     for(i=0; i<width; i++){
1119         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1120     }
1121 }
1122
1123 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1124     int i;
1125
1126     for(i=0; i<width; i++){
1127 #ifdef lift5
1128         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1129 #else
1130         int r= 3*(b0[i] + b2[i]);
1131         r+= r>>4;
1132         r+= r>>8;
1133         b1[i] += (r+W_CO)>>W_CS;
1134 #endif
1135     }
1136 }
1137
1138 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1139     int i;
1140
1141     for(i=0; i<width; i++){
1142 #ifdef liftS
1143         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1144 #else
1145         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1146 #endif
1147     }
1148 }
1149
1150 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1151     int i;
1152
1153     for(i=0; i<width; i++){
1154         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1155     }
1156 }
1157
1158 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1159     int y;
1160     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1161     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1162     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1163     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1164
1165     for(y=-4; y<height; y+=2){
1166         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1167         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1168
1169 {START_TIMER
1170         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1171         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1172 if(width>400){
1173 STOP_TIMER("horizontal_decompose97i")
1174 }}
1175
1176 {START_TIMER
1177         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1178         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1179         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1180         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1181
1182 if(width>400){
1183 STOP_TIMER("vertical_decompose97i")
1184 }}
1185
1186         b0=b2;
1187         b1=b3;
1188         b2=b4;
1189         b3=b5;
1190     }
1191 }
1192
1193 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1194     int level;
1195
1196     for(level=0; level<decomposition_count; level++){
1197         switch(type){
1198         case 0: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1199         case 1: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1200         case 2: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1201         }
1202     }
1203 }
1204
1205 static void horizontal_compose53i(DWTELEM *b, int width){
1206     DWTELEM temp[width];
1207     const int width2= width>>1;
1208     const int w2= (width+1)>>1;
1209     int x;
1210
1211 #if 0
1212     int A1,A2,A3,A4;
1213     A2= temp[1       ];
1214     A4= temp[0       ];
1215     A1= temp[0+width2];
1216     A1 -= (A2 + A4)>>1;
1217     A4 += (A1 + 1)>>1;
1218     b[0+width2] = A1;
1219     b[0       ] = A4;
1220     for(x=1; x+1<width2; x+=2){
1221         A3= temp[x+width2];
1222         A4= temp[x+1     ];
1223         A3 -= (A2 + A4)>>1;
1224         A2 += (A1 + A3 + 2)>>2;
1225         b[x+width2] = A3;
1226         b[x       ] = A2;
1227
1228         A1= temp[x+1+width2];
1229         A2= temp[x+2       ];
1230         A1 -= (A2 + A4)>>1;
1231         A4 += (A1 + A3 + 2)>>2;
1232         b[x+1+width2] = A1;
1233         b[x+1       ] = A4;
1234     }
1235     A3= temp[width-1];
1236     A3 -= A2;
1237     A2 += (A1 + A3 + 2)>>2;
1238     b[width -1] = A3;
1239     b[width2-1] = A2;
1240 #else
1241     lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1242     lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1243 #endif
1244     for(x=0; x<width2; x++){
1245         b[2*x    ]= temp[x   ];
1246         b[2*x + 1]= temp[x+w2];
1247     }
1248     if(width&1)
1249         b[2*x    ]= temp[x   ];
1250 }
1251
1252 static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1253     int i;
1254
1255     for(i=0; i<width; i++){
1256         b1[i] += (b0[i] + b2[i])>>1;
1257     }
1258 }
1259
1260 static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1261     int i;
1262
1263     for(i=0; i<width; i++){
1264         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1265     }
1266 }
1267
1268 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1269     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1270     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1271     cs->y = -1;
1272 }
1273
1274 static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1275     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1276     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1277     cs->y = -1;
1278 }
1279
1280 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1281     int y= cs->y;
1282
1283     DWTELEM *b0= cs->b0;
1284     DWTELEM *b1= cs->b1;
1285     DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1286     DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1287
1288 {START_TIMER
1289         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1290         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1291 STOP_TIMER("vertical_compose53i*")}
1292
1293 {START_TIMER
1294         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1295         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1296 STOP_TIMER("horizontal_compose53i")}
1297
1298     cs->b0 = b2;
1299     cs->b1 = b3;
1300     cs->y += 2;
1301 }
1302
1303 static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1304     int y= cs->y;
1305     DWTELEM *b0= cs->b0;
1306     DWTELEM *b1= cs->b1;
1307     DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1308     DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1309
1310 {START_TIMER
1311         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1312         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1313 STOP_TIMER("vertical_compose53i*")}
1314
1315 {START_TIMER
1316         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1317         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1318 STOP_TIMER("horizontal_compose53i")}
1319
1320     cs->b0 = b2;
1321     cs->b1 = b3;
1322     cs->y += 2;
1323 }
1324
1325 static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1326     dwt_compose_t cs;
1327     spatial_compose53i_init(&cs, buffer, height, stride);
1328     while(cs.y <= height)
1329         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1330 }
1331
1332
1333 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1334     DWTELEM temp[width];
1335     const int w2= (width+1)>>1;
1336
1337     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1338     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1339     liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1340     lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1341 }
1342
1343 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1344     int i;
1345
1346     for(i=0; i<width; i++){
1347         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1348     }
1349 }
1350
1351 static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1352     int i;
1353
1354     for(i=0; i<width; i++){
1355 #ifdef lift5
1356         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1357 #else
1358         int r= 3*(b0[i] + b2[i]);
1359         r+= r>>4;
1360         r+= r>>8;
1361         b1[i] -= (r+W_CO)>>W_CS;
1362 #endif
1363     }
1364 }
1365
1366 static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1367     int i;
1368
1369     for(i=0; i<width; i++){
1370 #ifdef liftS
1371         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1372 #else
1373         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1374 #endif
1375     }
1376 }
1377
1378 static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1379     int i;
1380
1381     for(i=0; i<width; i++){
1382         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1383     }
1384 }
1385
1386 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1387     int i;
1388
1389     for(i=0; i<width; i++){
1390 #ifndef lift5
1391         int r;
1392 #endif
1393         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1394 #ifdef lift5
1395         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1396 #else
1397         r= 3*(b2[i] + b4[i]);
1398         r+= r>>4;
1399         r+= r>>8;
1400         b3[i] -= (r+W_CO)>>W_CS;
1401 #endif
1402 #ifdef liftS
1403         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1404 #else
1405         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1406 #endif
1407         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1408     }
1409 }
1410
1411 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1412     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1413     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1414     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1415     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1416     cs->y = -3;
1417 }
1418
1419 static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1420     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1421     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1422     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1423     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1424     cs->y = -3;
1425 }
1426
1427 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1428     int y = cs->y;
1429
1430     DWTELEM *b0= cs->b0;
1431     DWTELEM *b1= cs->b1;
1432     DWTELEM *b2= cs->b2;
1433     DWTELEM *b3= cs->b3;
1434     DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1435     DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1436
1437 {START_TIMER
1438     if(y>0 && y+4<height){
1439         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1440     }else{
1441         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1442         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1443         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1444         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1445     }
1446 if(width>400){
1447 STOP_TIMER("vertical_compose97i")}}
1448
1449 {START_TIMER
1450         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1451         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1452 if(width>400 && y+0<(unsigned)height){
1453 STOP_TIMER("horizontal_compose97i")}}
1454
1455     cs->b0=b2;
1456     cs->b1=b3;
1457     cs->b2=b4;
1458     cs->b3=b5;
1459     cs->y += 2;
1460 }
1461
1462 static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1463     int y = cs->y;
1464     DWTELEM *b0= cs->b0;
1465     DWTELEM *b1= cs->b1;
1466     DWTELEM *b2= cs->b2;
1467     DWTELEM *b3= cs->b3;
1468     DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1469     DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1470
1471 {START_TIMER
1472         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1473         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1474         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1475         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1476 if(width>400){
1477 STOP_TIMER("vertical_compose97i")}}
1478
1479 {START_TIMER
1480         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1481         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1482 if(width>400 && b0 <= b2){
1483 STOP_TIMER("horizontal_compose97i")}}
1484
1485     cs->b0=b2;
1486     cs->b1=b3;
1487     cs->b2=b4;
1488     cs->b3=b5;
1489     cs->y += 2;
1490 }
1491
1492 static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1493     dwt_compose_t cs;
1494     spatial_compose97i_init(&cs, buffer, height, stride);
1495     while(cs.y <= height)
1496         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1497 }
1498
1499 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1500     int level;
1501     for(level=decomposition_count-1; level>=0; level--){
1502         switch(type){
1503         case 0: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1504         case 1: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1505         /* not slicified yet */
1506         case 2: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1507           av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1508         }
1509     }
1510 }
1511
1512 static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1513     int level;
1514     for(level=decomposition_count-1; level>=0; level--){
1515         switch(type){
1516         case 0: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1517         case 1: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1518         /* not slicified yet */
1519         case 2: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1520         }
1521     }
1522 }
1523
1524 static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1525     const int support = type==1 ? 3 : 5;
1526     int level;
1527     if(type==2) return;
1528
1529     for(level=decomposition_count-1; level>=0; level--){
1530         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1531             switch(type){
1532             case 0: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1533                     break;
1534             case 1: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1535                     break;
1536             case 2: break;
1537             }
1538         }
1539     }
1540 }
1541
1542 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1543     const int support = type==1 ? 3 : 5;
1544     int level;
1545     if(type==2) return;
1546
1547     for(level=decomposition_count-1; level>=0; level--){
1548         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1549             switch(type){
1550             case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1551                     break;
1552             case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1553                     break;
1554             case 2: break;
1555             }
1556         }
1557     }
1558 }
1559
1560 static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1561     if(type==2){
1562         int level;
1563         for(level=decomposition_count-1; level>=0; level--)
1564             spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1565     }else{
1566         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1567         int y;
1568         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1569         for(y=0; y<height; y+=4)
1570             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1571     }
1572 }
1573
1574 static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1575     const int w= b->width;
1576     const int h= b->height;
1577     int x, y;
1578
1579     if(1){
1580         int run=0;
1581         int runs[w*h];
1582         int run_index=0;
1583         int max_index;
1584
1585         for(y=0; y<h; y++){
1586             for(x=0; x<w; x++){
1587                 int v, p=0;
1588                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1589                 v= src[x + y*stride];
1590
1591                 if(y){
1592                     t= src[x + (y-1)*stride];
1593                     if(x){
1594                         lt= src[x - 1 + (y-1)*stride];
1595                     }
1596                     if(x + 1 < w){
1597                         rt= src[x + 1 + (y-1)*stride];
1598                     }
1599                 }
1600                 if(x){
1601                     l= src[x - 1 + y*stride];
1602                     /*if(x > 1){
1603                         if(orientation==1) ll= src[y + (x-2)*stride];
1604                         else               ll= src[x - 2 + y*stride];
1605                     }*/
1606                 }
1607                 if(parent){
1608                     int px= x>>1;
1609                     int py= y>>1;
1610                     if(px<b->parent->width && py<b->parent->height)
1611                         p= parent[px + py*2*stride];
1612                 }
1613                 if(!(/*ll|*/l|lt|t|rt|p)){
1614                     if(v){
1615                         runs[run_index++]= run;
1616                         run=0;
1617                     }else{
1618                         run++;
1619                     }
1620                 }
1621             }
1622         }
1623         max_index= run_index;
1624         runs[run_index++]= run;
1625         run_index=0;
1626         run= runs[run_index++];
1627
1628         put_symbol2(&s->c, b->state[30], max_index, 0);
1629         if(run_index <= max_index)
1630             put_symbol2(&s->c, b->state[1], run, 3);
1631
1632         for(y=0; y<h; y++){
1633             if(s->c.bytestream_end - s->c.bytestream < w*40){
1634                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1635                 return -1;
1636             }
1637             for(x=0; x<w; x++){
1638                 int v, p=0;
1639                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1640                 v= src[x + y*stride];
1641
1642                 if(y){
1643                     t= src[x + (y-1)*stride];
1644                     if(x){
1645                         lt= src[x - 1 + (y-1)*stride];
1646                     }
1647                     if(x + 1 < w){
1648                         rt= src[x + 1 + (y-1)*stride];
1649                     }
1650                 }
1651                 if(x){
1652                     l= src[x - 1 + y*stride];
1653                     /*if(x > 1){
1654                         if(orientation==1) ll= src[y + (x-2)*stride];
1655                         else               ll= src[x - 2 + y*stride];
1656                     }*/
1657                 }
1658                 if(parent){
1659                     int px= x>>1;
1660                     int py= y>>1;
1661                     if(px<b->parent->width && py<b->parent->height)
1662                         p= parent[px + py*2*stride];
1663                 }
1664                 if(/*ll|*/l|lt|t|rt|p){
1665                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1666
1667                     put_rac(&s->c, &b->state[0][context], !!v);
1668                 }else{
1669                     if(!run){
1670                         run= runs[run_index++];
1671
1672                         if(run_index <= max_index)
1673                             put_symbol2(&s->c, b->state[1], run, 3);
1674                         assert(v);
1675                     }else{
1676                         run--;
1677                         assert(!v);
1678                     }
1679                 }
1680                 if(v){
1681                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1682                     int l2= 2*ABS(l) + (l<0);
1683                     int t2= 2*ABS(t) + (t<0);
1684
1685                     put_symbol2(&s->c, b->state[context + 2], ABS(v)-1, context-4);
1686                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1687                 }
1688             }
1689         }
1690     }
1691     return 0;
1692 }
1693
1694 static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1695 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1696 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1697     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1698 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1699 }
1700
1701 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1702     const int w= b->width;
1703     const int h= b->height;
1704     int x,y;
1705
1706     if(1){
1707         int run, runs;
1708         x_and_coeff *xc= b->x_coeff;
1709         x_and_coeff *prev_xc= NULL;
1710         x_and_coeff *prev2_xc= xc;
1711         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1712         x_and_coeff *prev_parent_xc= parent_xc;
1713
1714         runs= get_symbol2(&s->c, b->state[30], 0);
1715         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1716         else           run= INT_MAX;
1717
1718         for(y=0; y<h; y++){
1719             int v=0;
1720             int lt=0, t=0, rt=0;
1721
1722             if(y && prev_xc->x == 0){
1723                 rt= prev_xc->coeff;
1724             }
1725             for(x=0; x<w; x++){
1726                 int p=0;
1727                 const int l= v;
1728
1729                 lt= t; t= rt;
1730
1731                 if(y){
1732                     if(prev_xc->x <= x)
1733                         prev_xc++;
1734                     if(prev_xc->x == x + 1)
1735                         rt= prev_xc->coeff;
1736                     else
1737                         rt=0;
1738                 }
1739                 if(parent_xc){
1740                     if(x>>1 > parent_xc->x){
1741                         parent_xc++;
1742                     }
1743                     if(x>>1 == parent_xc->x){
1744                         p= parent_xc->coeff;
1745                     }
1746                 }
1747                 if(/*ll|*/l|lt|t|rt|p){
1748                     int context= av_log2(/*ABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1749
1750                     v=get_rac(&s->c, &b->state[0][context]);
1751                     if(v){
1752                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1753                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1754
1755                         xc->x=x;
1756                         (xc++)->coeff= v;
1757                     }
1758                 }else{
1759                     if(!run){
1760                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1761                         else           run= INT_MAX;
1762                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1763                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1764
1765                         xc->x=x;
1766                         (xc++)->coeff= v;
1767                     }else{
1768                         int max_run;
1769                         run--;
1770                         v=0;
1771
1772                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1773                         else  max_run= FFMIN(run, w-x-1);
1774                         if(parent_xc)
1775                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1776                         x+= max_run;
1777                         run-= max_run;
1778                     }
1779                 }
1780             }
1781             (xc++)->x= w+1; //end marker
1782             prev_xc= prev2_xc;
1783             prev2_xc= xc;
1784
1785             if(parent_xc){
1786                 if(y&1){
1787                     while(parent_xc->x != parent->width+1)
1788                         parent_xc++;
1789                     parent_xc++;
1790                     prev_parent_xc= parent_xc;
1791                 }else{
1792                     parent_xc= prev_parent_xc;
1793                 }
1794             }
1795         }
1796
1797         (xc++)->x= w+1; //end marker
1798     }
1799 }
1800
1801 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1802     const int w= b->width;
1803     int y;
1804     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1805     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1806     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1807     int new_index = 0;
1808
1809     START_TIMER
1810
1811     if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1812         qadd= 0;
1813         qmul= 1<<QEXPSHIFT;
1814     }
1815
1816     /* If we are on the second or later slice, restore our index. */
1817     if (start_y != 0)
1818         new_index = save_state[0];
1819
1820
1821     for(y=start_y; y<h; y++){
1822         int x = 0;
1823         int v;
1824         DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1825         memset(line, 0, b->width*sizeof(DWTELEM));
1826         v = b->x_coeff[new_index].coeff;
1827         x = b->x_coeff[new_index++].x;
1828         while(x < w)
1829         {
1830             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1831             register int u= -(v&1);
1832             line[x] = (t^u) - u;
1833
1834             v = b->x_coeff[new_index].coeff;
1835             x = b->x_coeff[new_index++].x;
1836         }
1837     }
1838     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1839         STOP_TIMER("decode_subband")
1840     }
1841
1842     /* Save our variables for the next slice. */
1843     save_state[0] = new_index;
1844
1845     return;
1846 }
1847
1848 static void reset_contexts(SnowContext *s){
1849     int plane_index, level, orientation;
1850
1851     for(plane_index=0; plane_index<3; plane_index++){
1852         for(level=0; level<s->spatial_decomposition_count; level++){
1853             for(orientation=level ? 1:0; orientation<4; orientation++){
1854                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1855             }
1856         }
1857     }
1858     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1859     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1860 }
1861
1862 static int alloc_blocks(SnowContext *s){
1863     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1864     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1865
1866     s->b_width = w;
1867     s->b_height= h;
1868
1869     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1870     return 0;
1871 }
1872
1873 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1874     uint8_t *bytestream= d->bytestream;
1875     uint8_t *bytestream_start= d->bytestream_start;
1876     *d= *s;
1877     d->bytestream= bytestream;
1878     d->bytestream_start= bytestream_start;
1879 }
1880
1881 //near copy & paste from dsputil, FIXME
1882 static int pix_sum(uint8_t * pix, int line_size, int w)
1883 {
1884     int s, i, j;
1885
1886     s = 0;
1887     for (i = 0; i < w; i++) {
1888         for (j = 0; j < w; j++) {
1889             s += pix[0];
1890             pix ++;
1891         }
1892         pix += line_size - w;
1893     }
1894     return s;
1895 }
1896
1897 //near copy & paste from dsputil, FIXME
1898 static int pix_norm1(uint8_t * pix, int line_size, int w)
1899 {
1900     int s, i, j;
1901     uint32_t *sq = squareTbl + 256;
1902
1903     s = 0;
1904     for (i = 0; i < w; i++) {
1905         for (j = 0; j < w; j ++) {
1906             s += sq[pix[0]];
1907             pix ++;
1908         }
1909         pix += line_size - w;
1910     }
1911     return s;
1912 }
1913
1914 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1915     const int w= s->b_width << s->block_max_depth;
1916     const int rem_depth= s->block_max_depth - level;
1917     const int index= (x + y*w) << rem_depth;
1918     const int block_w= 1<<rem_depth;
1919     BlockNode block;
1920     int i,j;
1921
1922     block.color[0]= l;
1923     block.color[1]= cb;
1924     block.color[2]= cr;
1925     block.mx= mx;
1926     block.my= my;
1927     block.ref= ref;
1928     block.type= type;
1929     block.level= level;
1930
1931     for(j=0; j<block_w; j++){
1932         for(i=0; i<block_w; i++){
1933             s->block[index + i + j*w]= block;
1934         }
1935     }
1936 }
1937
1938 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1939     const int offset[3]= {
1940           y*c->  stride + x,
1941         ((y*c->uvstride + x)>>1),
1942         ((y*c->uvstride + x)>>1),
1943     };
1944     int i;
1945     for(i=0; i<3; i++){
1946         c->src[0][i]= src [i];
1947         c->ref[0][i]= ref [i] + offset[i];
1948     }
1949     assert(!ref_index);
1950 }
1951
1952 //FIXME copy&paste
1953 #define P_LEFT P[1]
1954 #define P_TOP P[2]
1955 #define P_TOPRIGHT P[3]
1956 #define P_MEDIAN P[4]
1957 #define P_MV1 P[9]
1958 #define FLAG_QPEL   1 //must be 1
1959
1960 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1961     uint8_t p_buffer[1024];
1962     uint8_t i_buffer[1024];
1963     uint8_t p_state[sizeof(s->block_state)];
1964     uint8_t i_state[sizeof(s->block_state)];
1965     RangeCoder pc, ic;
1966     uint8_t *pbbak= s->c.bytestream;
1967     uint8_t *pbbak_start= s->c.bytestream_start;
1968     int score, score2, iscore, i_len, p_len, block_s, sum;
1969     const int w= s->b_width  << s->block_max_depth;
1970     const int h= s->b_height << s->block_max_depth;
1971     const int rem_depth= s->block_max_depth - level;
1972     const int index= (x + y*w) << rem_depth;
1973     const int block_w= 1<<(LOG2_MB_SIZE - level);
1974     int trx= (x+1)<<rem_depth;
1975     int try= (y+1)<<rem_depth;
1976     BlockNode *left  = x ? &s->block[index-1] : &null_block;
1977     BlockNode *top   = y ? &s->block[index-w] : &null_block;
1978     BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1979     BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1980     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1981     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1982     int pl = left->color[0];
1983     int pcb= left->color[1];
1984     int pcr= left->color[2];
1985     int pmx= mid_pred(left->mx, top->mx, tr->mx);
1986     int pmy= mid_pred(left->my, top->my, tr->my);
1987     int mx=0, my=0;
1988     int l,cr,cb;
1989     const int stride= s->current_picture.linesize[0];
1990     const int uvstride= s->current_picture.linesize[1];
1991     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1992                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1993                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1994     int P[10][2];
1995     int16_t last_mv[3][2];
1996     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1997     const int shift= 1+qpel;
1998     MotionEstContext *c= &s->m.me;
1999     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2000     int mx_context= av_log2(2*ABS(left->mx - top->mx));
2001     int my_context= av_log2(2*ABS(left->my - top->my));
2002     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2003     int ref, best_ref, ref_score, ref_mx, ref_my;
2004
2005     assert(sizeof(s->block_state) >= 256);
2006     if(s->keyframe){
2007         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, 0, BLOCK_INTRA);
2008         return 0;
2009     }
2010
2011 //    clip predictors / edge ?
2012
2013     P_LEFT[0]= left->mx;
2014     P_LEFT[1]= left->my;
2015     P_TOP [0]= top->mx;
2016     P_TOP [1]= top->my;
2017     P_TOPRIGHT[0]= tr->mx;
2018     P_TOPRIGHT[1]= tr->my;
2019
2020     last_mv[0][0]= s->block[index].mx;
2021     last_mv[0][1]= s->block[index].my;
2022     last_mv[1][0]= right->mx;
2023     last_mv[1][1]= right->my;
2024     last_mv[2][0]= bottom->mx;
2025     last_mv[2][1]= bottom->my;
2026
2027     s->m.mb_stride=2;
2028     s->m.mb_x=
2029     s->m.mb_y= 0;
2030     s->m.me.skip= 0;
2031
2032     assert(s->m.me.  stride ==   stride);
2033     assert(s->m.me.uvstride == uvstride);
2034
2035     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2036     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2037     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2038     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2039
2040     c->xmin = - x*block_w - 16+2;
2041     c->ymin = - y*block_w - 16+2;
2042     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2043     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2044
2045     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2046     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2047     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2048     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2049     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2050     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2051     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2052
2053     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2054     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2055
2056     if (!y) {
2057         c->pred_x= P_LEFT[0];
2058         c->pred_y= P_LEFT[1];
2059     } else {
2060         c->pred_x = P_MEDIAN[0];
2061         c->pred_y = P_MEDIAN[1];
2062     }
2063
2064     score= INT_MAX;
2065     best_ref= 0;
2066     for(ref=0; ref<s->ref_frames; ref++){
2067         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
2068
2069         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
2070                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2071
2072         assert(ref_mx >= c->xmin);
2073         assert(ref_mx <= c->xmax);
2074         assert(ref_my >= c->ymin);
2075         assert(ref_my <= c->ymax);
2076
2077         ref_score= s->m.me.sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2078         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2079         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
2080         if(s->ref_mvs[ref]){
2081             s->ref_mvs[ref][index][0]= ref_mx;
2082             s->ref_mvs[ref][index][1]= ref_my;
2083             s->ref_scores[ref][index]= ref_score;
2084         }
2085         if(score > ref_score){
2086             score= ref_score;
2087             best_ref= ref;
2088             mx= ref_mx;
2089             my= ref_my;
2090         }
2091     }
2092     //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2093
2094   //  subpel search
2095     pc= s->c;
2096     pc.bytestream_start=
2097     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2098     memcpy(p_state, s->block_state, sizeof(s->block_state));
2099
2100     if(level!=s->block_max_depth)
2101         put_rac(&pc, &p_state[4 + s_context], 1);
2102     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2103     if(s->ref_frames > 1)
2104         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
2105     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
2106     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
2107     p_len= pc.bytestream - pc.bytestream_start;
2108     score += (s->lambda2*(p_len*8
2109               + (pc.outstanding_count - s->c.outstanding_count)*8
2110               + (-av_log2(pc.range)    + av_log2(s->c.range))
2111              ))>>FF_LAMBDA_SHIFT;
2112
2113     block_s= block_w*block_w;
2114     sum = pix_sum(current_data[0], stride, block_w);
2115     l= (sum + block_s/2)/block_s;
2116     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2117
2118     block_s= block_w*block_w>>2;
2119     sum = pix_sum(current_data[1], uvstride, block_w>>1);
2120     cb= (sum + block_s/2)/block_s;
2121 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2122     sum = pix_sum(current_data[2], uvstride, block_w>>1);
2123     cr= (sum + block_s/2)/block_s;
2124 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2125
2126     ic= s->c;
2127     ic.bytestream_start=
2128     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2129     memcpy(i_state, s->block_state, sizeof(s->block_state));
2130     if(level!=s->block_max_depth)
2131         put_rac(&ic, &i_state[4 + s_context], 1);
2132     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2133     put_symbol(&ic, &i_state[32],  l-pl , 1);
2134     put_symbol(&ic, &i_state[64], cb-pcb, 1);
2135     put_symbol(&ic, &i_state[96], cr-pcr, 1);
2136     i_len= ic.bytestream - ic.bytestream_start;
2137     iscore += (s->lambda2*(i_len*8
2138               + (ic.outstanding_count - s->c.outstanding_count)*8
2139               + (-av_log2(ic.range)    + av_log2(s->c.range))
2140              ))>>FF_LAMBDA_SHIFT;
2141
2142 //    assert(score==256*256*256*64-1);
2143     assert(iscore < 255*255*256 + s->lambda2*10);
2144     assert(iscore >= 0);
2145     assert(l>=0 && l<=255);
2146     assert(pl>=0 && pl<=255);
2147
2148     if(level==0){
2149         int varc= iscore >> 8;
2150         int vard= score >> 8;
2151         if (vard <= 64 || vard < varc)
2152             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2153         else
2154             c->scene_change_score+= s->m.qscale;
2155     }
2156
2157     if(level!=s->block_max_depth){
2158         put_rac(&s->c, &s->block_state[4 + s_context], 0);
2159         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2160         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2161         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2162         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2163         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2164
2165         if(score2 < score && score2 < iscore)
2166             return score2;
2167     }
2168
2169     if(iscore < score){
2170         memcpy(pbbak, i_buffer, i_len);
2171         s->c= ic;
2172         s->c.bytestream_start= pbbak_start;
2173         s->c.bytestream= pbbak + i_len;
2174         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
2175         memcpy(s->block_state, i_state, sizeof(s->block_state));
2176         return iscore;
2177     }else{
2178         memcpy(pbbak, p_buffer, p_len);
2179         s->c= pc;
2180         s->c.bytestream_start= pbbak_start;
2181         s->c.bytestream= pbbak + p_len;
2182         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2183         memcpy(s->block_state, p_state, sizeof(s->block_state));
2184         return score;
2185     }
2186 }
2187
2188 static always_inline int same_block(BlockNode *a, BlockNode *b){
2189     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2190         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2191     }else{
2192         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2193     }
2194 }
2195
2196 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2197     const int w= s->b_width  << s->block_max_depth;
2198     const int rem_depth= s->block_max_depth - level;
2199     const int index= (x + y*w) << rem_depth;
2200     int trx= (x+1)<<rem_depth;
2201     BlockNode *b= &s->block[index];
2202     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2203     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2204     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2205     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2206     int pl = left->color[0];
2207     int pcb= left->color[1];
2208     int pcr= left->color[2];
2209     int pmx= mid_pred(left->mx, top->mx, tr->mx);
2210     int pmy= mid_pred(left->my, top->my, tr->my);
2211     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2212     int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 16*!!b->ref;
2213     int my_context= av_log2(2*ABS(left->my - top->my)) + 16*!!b->ref;
2214     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2215
2216     if(s->keyframe){
2217         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, 0, BLOCK_INTRA);
2218         return;
2219     }
2220
2221     if(level!=s->block_max_depth){
2222         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2223             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2224         }else{
2225             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2226             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2227             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2228             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2229             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2230             return;
2231         }
2232     }
2233     if(b->type & BLOCK_INTRA){
2234         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2235         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2236         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2237         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2238         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2239     }else{
2240         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2241         if(s->ref_frames > 1)
2242             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2243         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2244         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2245         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2246     }
2247 }
2248
2249 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2250     const int w= s->b_width << s->block_max_depth;
2251     const int rem_depth= s->block_max_depth - level;
2252     const int index= (x + y*w) << rem_depth;
2253     int trx= (x+1)<<rem_depth;
2254     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2255     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2256     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2257     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2258     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2259
2260     if(s->keyframe){
2261         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2262         return;
2263     }
2264
2265     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2266         int type;
2267         int l = left->color[0];
2268         int cb= left->color[1];
2269         int cr= left->color[2];
2270         int mx= mid_pred(left->mx, top->mx, tr->mx);
2271         int my= mid_pred(left->my, top->my, tr->my);
2272         int ref = 0;
2273         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2274         int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
2275         int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
2276
2277         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2278
2279         if(type){
2280             l += get_symbol(&s->c, &s->block_state[32], 1);
2281             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2282             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2283         }else{
2284             if(s->ref_frames > 1)
2285                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2286             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2287             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2288         }
2289         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2290     }else{
2291         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2292         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2293         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2294         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2295     }
2296 }
2297
2298 static void encode_blocks(SnowContext *s){
2299     int x, y;
2300     int w= s->b_width;
2301     int h= s->b_height;
2302
2303     if(s->avctx->me_method == ME_ITER && !s->keyframe)
2304         iterative_me(s);
2305
2306     for(y=0; y<h; y++){
2307         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2308             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2309             return;
2310         }
2311         for(x=0; x<w; x++){
2312             if(s->avctx->me_method == ME_ITER)
2313                 encode_q_branch2(s, 0, x, y);
2314             else
2315                 encode_q_branch (s, 0, x, y);
2316         }
2317     }
2318 }
2319
2320 static void decode_blocks(SnowContext *s){
2321     int x, y;
2322     int w= s->b_width;
2323     int h= s->b_height;
2324
2325     for(y=0; y<h; y++){
2326         for(x=0; x<w; x++){
2327             decode_q_branch(s, 0, x, y);
2328         }
2329     }
2330 }
2331
2332 static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2333     int x, y;
2334 START_TIMER
2335     for(y=0; y < b_h+5; y++){
2336         for(x=0; x < b_w; x++){
2337             int a0= src[x    ];
2338             int a1= src[x + 1];
2339             int a2= src[x + 2];
2340             int a3= src[x + 3];
2341             int a4= src[x + 4];
2342             int a5= src[x + 5];
2343 //            int am= 9*(a1+a2) - (a0+a3);
2344             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2345 //            int am= 18*(a2+a3) - 2*(a1+a4);
2346 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2347 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2348
2349 //            if(b_w==16) am= 8*(a1+a2);
2350
2351             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2352             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2353
2354             /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2355             if(am&(~255)) am= ~(am>>31);
2356
2357             tmp[x] = am;
2358
2359 /*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2360             else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2361             else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2362             else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2363         }
2364         tmp += stride;
2365         src += stride;
2366     }
2367     tmp -= (b_h+5)*stride;
2368
2369     for(y=0; y < b_h; y++){
2370         for(x=0; x < b_w; x++){
2371             int a0= tmp[x + 0*stride];
2372             int a1= tmp[x + 1*stride];
2373             int a2= tmp[x + 2*stride];
2374             int a3= tmp[x + 3*stride];
2375             int a4= tmp[x + 4*stride];
2376             int a5= tmp[x + 5*stride];
2377             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2378 //            int am= 18*(a2+a3) - 2*(a1+a4);
2379 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2380             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2381
2382 //            if(b_w==16) am= 8*(a1+a2);
2383
2384             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2385             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2386
2387             if(am&(~255)) am= ~(am>>31);
2388
2389             dst[x] = am;
2390 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2391             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2392             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2393             else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2394         }
2395         dst += stride;
2396         tmp += stride;
2397     }
2398 STOP_TIMER("mc_block")
2399 }
2400
2401 #define mca(dx,dy,b_w)\
2402 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, uint8_t *src, int stride, int h){\
2403     uint8_t tmp[stride*(b_w+5)];\
2404     assert(h==b_w);\
2405     mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2406 }
2407
2408 mca( 0, 0,16)
2409 mca( 8, 0,16)
2410 mca( 0, 8,16)
2411 mca( 8, 8,16)
2412 mca( 0, 0,8)
2413 mca( 8, 0,8)
2414 mca( 0, 8,8)
2415 mca( 8, 8,8)
2416
2417 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2418     if(block->type & BLOCK_INTRA){
2419         int x, y;
2420         const int color = block->color[plane_index];
2421         const int color4= color*0x01010101;
2422         if(b_w==32){
2423             for(y=0; y < b_h; y++){
2424                 *(uint32_t*)&dst[0 + y*stride]= color4;
2425                 *(uint32_t*)&dst[4 + y*stride]= color4;
2426                 *(uint32_t*)&dst[8 + y*stride]= color4;
2427                 *(uint32_t*)&dst[12+ y*stride]= color4;
2428                 *(uint32_t*)&dst[16+ y*stride]= color4;
2429                 *(uint32_t*)&dst[20+ y*stride]= color4;
2430                 *(uint32_t*)&dst[24+ y*stride]= color4;
2431                 *(uint32_t*)&dst[28+ y*stride]= color4;
2432             }
2433         }else if(b_w==16){
2434             for(y=0; y < b_h; y++){
2435                 *(uint32_t*)&dst[0 + y*stride]= color4;
2436                 *(uint32_t*)&dst[4 + y*stride]= color4;
2437                 *(uint32_t*)&dst[8 + y*stride]= color4;
2438                 *(uint32_t*)&dst[12+ y*stride]= color4;
2439             }
2440         }else if(b_w==8){
2441             for(y=0; y < b_h; y++){
2442                 *(uint32_t*)&dst[0 + y*stride]= color4;
2443                 *(uint32_t*)&dst[4 + y*stride]= color4;
2444             }
2445         }else if(b_w==4){
2446             for(y=0; y < b_h; y++){
2447                 *(uint32_t*)&dst[0 + y*stride]= color4;
2448             }
2449         }else{
2450             for(y=0; y < b_h; y++){
2451                 for(x=0; x < b_w; x++){
2452                     dst[x + y*stride]= color;
2453                 }
2454             }
2455         }
2456     }else{
2457         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2458         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2459         int mx= block->mx*scale;
2460         int my= block->my*scale;
2461         const int dx= mx&15;
2462         const int dy= my&15;
2463         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2464         sx += (mx>>4) - 2;
2465         sy += (my>>4) - 2;
2466         src += sx + sy*stride;
2467         if(   (unsigned)sx >= w - b_w - 4
2468            || (unsigned)sy >= h - b_h - 4){
2469             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2470             src= tmp + MB_SIZE;
2471         }
2472 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2473 //        assert(!(b_w&(b_w-1)));
2474         assert(b_w>1 && b_h>1);
2475         assert(tab_index>=0 && tab_index<4 || b_w==32);
2476         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2477             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2478         else if(b_w==32){
2479             int y;
2480             for(y=0; y<b_h; y+=16){
2481                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2482                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2483             }
2484         }else if(b_w==b_h)
2485             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2486         else if(b_w==2*b_h){
2487             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2488             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2489         }else{
2490             assert(2*b_w==b_h);
2491             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2492             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2493         }
2494     }
2495 }
2496
2497 void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2498                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2499     int y, x;
2500     DWTELEM * dst;
2501     for(y=0; y<b_h; y++){
2502         //FIXME ugly missue of obmc_stride
2503         uint8_t *obmc1= obmc + y*obmc_stride;
2504         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2505         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2506         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2507         dst = slice_buffer_get_line(sb, src_y + y);
2508         for(x=0; x<b_w; x++){
2509             int v=   obmc1[x] * block[3][x + y*src_stride]
2510                     +obmc2[x] * block[2][x + y*src_stride]
2511                     +obmc3[x] * block[1][x + y*src_stride]
2512                     +obmc4[x] * block[0][x + y*src_stride];
2513
2514             v <<= 8 - LOG2_OBMC_MAX;
2515             if(FRAC_BITS != 8){
2516                 v += 1<<(7 - FRAC_BITS);
2517                 v >>= 8 - FRAC_BITS;
2518             }
2519             if(add){
2520                 v += dst[x + src_x];
2521                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2522                 if(v&(~255)) v= ~(v>>31);
2523                 dst8[x + y*src_stride] = v;
2524             }else{
2525                 dst[x + src_x] -= v;
2526             }
2527         }
2528     }
2529 }
2530
2531 //FIXME name clenup (b_w, block_w, b_width stuff)
2532 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
2533     DWTELEM * dst = NULL;
2534     const int b_width = s->b_width  << s->block_max_depth;
2535     const int b_height= s->b_height << s->block_max_depth;
2536     const int b_stride= b_width;
2537     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2538     BlockNode *rt= lt+1;
2539     BlockNode *lb= lt+b_stride;
2540     BlockNode *rb= lb+1;
2541     uint8_t *block[4];
2542     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2543     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2544     uint8_t *ptmp;
2545     int x,y;
2546
2547     if(b_x<0){
2548         lt= rt;
2549         lb= rb;
2550     }else if(b_x + 1 >= b_width){
2551         rt= lt;
2552         rb= lb;
2553     }
2554     if(b_y<0){
2555         lt= lb;
2556         rt= rb;
2557     }else if(b_y + 1 >= b_height){
2558         lb= lt;
2559         rb= rt;
2560     }
2561
2562     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2563         obmc -= src_x;
2564         b_w += src_x;
2565         src_x=0;
2566     }else if(src_x + b_w > w){
2567         b_w = w - src_x;
2568     }
2569     if(src_y<0){
2570         obmc -= src_y*obmc_stride;
2571         b_h += src_y;
2572         src_y=0;
2573     }else if(src_y + b_h> h){
2574         b_h = h - src_y;
2575     }
2576
2577     if(b_w<=0 || b_h<=0) return;
2578
2579 assert(src_stride > 2*MB_SIZE + 5);
2580 //    old_dst += src_x + src_y*dst_stride;
2581     dst8+= src_x + src_y*src_stride;
2582 //    src += src_x + src_y*src_stride;
2583
2584     ptmp= tmp + 3*tmp_step;
2585     block[0]= ptmp;
2586     ptmp+=tmp_step;
2587     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2588
2589     if(same_block(lt, rt)){
2590         block[1]= block[0];
2591     }else{
2592         block[1]= ptmp;
2593         ptmp+=tmp_step;
2594         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2595     }
2596
2597     if(same_block(lt, lb)){
2598         block[2]= block[0];
2599     }else if(same_block(rt, lb)){
2600         block[2]= block[1];
2601     }else{
2602         block[2]= ptmp;
2603         ptmp+=tmp_step;
2604         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2605     }
2606
2607     if(same_block(lt, rb) ){
2608         block[3]= block[0];
2609     }else if(same_block(rt, rb)){
2610         block[3]= block[1];
2611     }else if(same_block(lb, rb)){
2612         block[3]= block[2];
2613     }else{
2614         block[3]= ptmp;
2615         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2616     }
2617 #if 0
2618     for(y=0; y<b_h; y++){
2619         for(x=0; x<b_w; x++){
2620             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2621             if(add) dst[x + y*dst_stride] += v;
2622             else    dst[x + y*dst_stride] -= v;
2623         }
2624     }
2625     for(y=0; y<b_h; y++){
2626         uint8_t *obmc2= obmc + (obmc_stride>>1);
2627         for(x=0; x<b_w; x++){
2628             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2629             if(add) dst[x + y*dst_stride] += v;
2630             else    dst[x + y*dst_stride] -= v;
2631         }
2632     }
2633     for(y=0; y<b_h; y++){
2634         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2635         for(x=0; x<b_w; x++){
2636             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2637             if(add) dst[x + y*dst_stride] += v;
2638             else    dst[x + y*dst_stride] -= v;
2639         }
2640     }
2641     for(y=0; y<b_h; y++){
2642         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2643         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2644         for(x=0; x<b_w; x++){
2645             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2646             if(add) dst[x + y*dst_stride] += v;
2647             else    dst[x + y*dst_stride] -= v;
2648         }
2649     }
2650 #else
2651 {
2652
2653     START_TIMER
2654
2655     s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2656         STOP_TIMER("Inner add y block")
2657 }
2658 #endif
2659 }
2660
2661 //FIXME name clenup (b_w, block_w, b_width stuff)
2662 static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2663     const int b_width = s->b_width  << s->block_max_depth;
2664     const int b_height= s->b_height << s->block_max_depth;
2665     const int b_stride= b_width;
2666     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2667     BlockNode *rt= lt+1;
2668     BlockNode *lb= lt+b_stride;
2669     BlockNode *rb= lb+1;
2670     uint8_t *block[4];
2671     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2672     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2673     uint8_t *ptmp;
2674     int x,y;
2675
2676     if(b_x<0){
2677         lt= rt;
2678         lb= rb;
2679     }else if(b_x + 1 >= b_width){
2680         rt= lt;
2681         rb= lb;
2682     }
2683     if(b_y<0){
2684         lt= lb;
2685         rt= rb;
2686     }else if(b_y + 1 >= b_height){
2687         lb= lt;
2688         rb= rt;
2689     }
2690
2691     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2692         obmc -= src_x;
2693         b_w += src_x;
2694         if(!offset_dst)
2695             dst -= src_x;
2696         src_x=0;
2697     }else if(src_x + b_w > w){
2698         b_w = w - src_x;
2699     }
2700     if(src_y<0){
2701         obmc -= src_y*obmc_stride;
2702         b_h += src_y;
2703         if(!offset_dst)
2704             dst -= src_y*dst_stride;
2705         src_y=0;
2706     }else if(src_y + b_h> h){
2707         b_h = h - src_y;
2708     }
2709
2710     if(b_w<=0 || b_h<=0) return;
2711
2712 assert(src_stride > 2*MB_SIZE + 5);
2713     if(offset_dst)
2714         dst += src_x + src_y*dst_stride;
2715     dst8+= src_x + src_y*src_stride;
2716 //    src += src_x + src_y*src_stride;
2717
2718     ptmp= tmp + 3*tmp_step;
2719     block[0]= ptmp;
2720     ptmp+=tmp_step;
2721     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2722
2723     if(same_block(lt, rt)){
2724         block[1]= block[0];
2725     }else{
2726         block[1]= ptmp;
2727         ptmp+=tmp_step;
2728         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2729     }
2730
2731     if(same_block(lt, lb)){
2732         block[2]= block[0];
2733     }else if(same_block(rt, lb)){
2734         block[2]= block[1];
2735     }else{
2736         block[2]= ptmp;
2737         ptmp+=tmp_step;
2738         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2739     }
2740
2741     if(same_block(lt, rb) ){
2742         block[3]= block[0];
2743     }else if(same_block(rt, rb)){
2744         block[3]= block[1];
2745     }else if(same_block(lb, rb)){
2746         block[3]= block[2];
2747     }else{
2748         block[3]= ptmp;
2749         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2750     }
2751 #if 0
2752     for(y=0; y<b_h; y++){
2753         for(x=0; x<b_w; x++){
2754             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2755             if(add) dst[x + y*dst_stride] += v;
2756             else    dst[x + y*dst_stride] -= v;
2757         }
2758     }
2759     for(y=0; y<b_h; y++){
2760         uint8_t *obmc2= obmc + (obmc_stride>>1);
2761         for(x=0; x<b_w; x++){
2762             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2763             if(add) dst[x + y*dst_stride] += v;
2764             else    dst[x + y*dst_stride] -= v;
2765         }
2766     }
2767     for(y=0; y<b_h; y++){
2768         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2769         for(x=0; x<b_w; x++){
2770             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2771             if(add) dst[x + y*dst_stride] += v;
2772             else    dst[x + y*dst_stride] -= v;
2773         }
2774     }
2775     for(y=0; y<b_h; y++){
2776         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2777         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2778         for(x=0; x<b_w; x++){
2779             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2780             if(add) dst[x + y*dst_stride] += v;
2781             else    dst[x + y*dst_stride] -= v;
2782         }
2783     }
2784 #else
2785     for(y=0; y<b_h; y++){
2786         //FIXME ugly missue of obmc_stride
2787         uint8_t *obmc1= obmc + y*obmc_stride;
2788         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2789         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2790         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2791         for(x=0; x<b_w; x++){
2792             int v=   obmc1[x] * block[3][x + y*src_stride]
2793                     +obmc2[x] * block[2][x + y*src_stride]
2794                     +obmc3[x] * block[1][x + y*src_stride]
2795                     +obmc4[x] * block[0][x + y*src_stride];
2796
2797             v <<= 8 - LOG2_OBMC_MAX;
2798             if(FRAC_BITS != 8){
2799                 v += 1<<(7 - FRAC_BITS);
2800                 v >>= 8 - FRAC_BITS;
2801             }
2802             if(add){
2803                 v += dst[x + y*dst_stride];
2804                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2805                 if(v&(~255)) v= ~(v>>31);
2806                 dst8[x + y*src_stride] = v;
2807             }else{
2808                 dst[x + y*dst_stride] -= v;
2809             }
2810         }
2811     }
2812 #endif
2813 }
2814
2815 static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2816     Plane *p= &s->plane[plane_index];
2817     const int mb_w= s->b_width  << s->block_max_depth;
2818     const int mb_h= s->b_height << s->block_max_depth;
2819     int x, y, mb_x;
2820     int block_size = MB_SIZE >> s->block_max_depth;
2821     int block_w    = plane_index ? block_size/2 : block_size;
2822     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2823     int obmc_stride= plane_index ? block_size : 2*block_size;
2824     int ref_stride= s->current_picture.linesize[plane_index];
2825     uint8_t *dst8= s->current_picture.data[plane_index];
2826     int w= p->width;
2827     int h= p->height;
2828     START_TIMER
2829
2830     if(s->keyframe || (s->avctx->debug&512)){
2831         if(mb_y==mb_h)
2832             return;
2833
2834         if(add){
2835             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2836             {
2837 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2838                 DWTELEM * line = sb->line[y];
2839                 for(x=0; x<w; x++)
2840                 {
2841 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2842                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2843                     v >>= FRAC_BITS;
2844                     if(v&(~255)) v= ~(v>>31);
2845                     dst8[x + y*ref_stride]= v;
2846                 }
2847             }
2848         }else{
2849             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2850             {
2851 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2852                 DWTELEM * line = sb->line[y];
2853                 for(x=0; x<w; x++)
2854                 {
2855                     line[x] -= 128 << FRAC_BITS;
2856 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2857                 }
2858             }
2859         }
2860
2861         return;
2862     }
2863
2864         for(mb_x=0; mb_x<=mb_w; mb_x++){
2865             START_TIMER
2866
2867             add_yblock_buffered(s, sb, old_buffer, dst8, obmc,
2868                        block_w*mb_x - block_w/2,
2869                        block_w*mb_y - block_w/2,
2870                        block_w, block_w,
2871                        w, h,
2872                        w, ref_stride, obmc_stride,
2873                        mb_x - 1, mb_y - 1,
2874                        add, plane_index);
2875
2876             STOP_TIMER("add_yblock")
2877         }
2878
2879     STOP_TIMER("predict_slice")
2880 }
2881
2882 static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2883     Plane *p= &s->plane[plane_index];
2884     const int mb_w= s->b_width  << s->block_max_depth;
2885     const int mb_h= s->b_height << s->block_max_depth;
2886     int x, y, mb_x;
2887     int block_size = MB_SIZE >> s->block_max_depth;
2888     int block_w    = plane_index ? block_size/2 : block_size;
2889     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2890     const int obmc_stride= plane_index ? block_size : 2*block_size;
2891     int ref_stride= s->current_picture.linesize[plane_index];
2892     uint8_t *dst8= s->current_picture.data[plane_index];
2893     int w= p->width;
2894     int h= p->height;
2895     START_TIMER
2896
2897     if(s->keyframe || (s->avctx->debug&512)){
2898         if(mb_y==mb_h)
2899             return;
2900
2901         if(add){
2902             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2903                 for(x=0; x<w; x++){
2904                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2905                     v >>= FRAC_BITS;
2906                     if(v&(~255)) v= ~(v>>31);
2907                     dst8[x + y*ref_stride]= v;
2908                 }
2909             }
2910         }else{
2911             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2912                 for(x=0; x<w; x++){
2913                     buf[x + y*w]-= 128<<FRAC_BITS;
2914                 }
2915             }
2916         }
2917
2918         return;
2919     }
2920
2921         for(mb_x=0; mb_x<=mb_w; mb_x++){
2922             START_TIMER
2923
2924             add_yblock(s, buf, dst8, obmc,
2925                        block_w*mb_x - block_w/2,
2926                        block_w*mb_y - block_w/2,
2927                        block_w, block_w,
2928                        w, h,
2929                        w, ref_stride, obmc_stride,
2930                        mb_x - 1, mb_y - 1,
2931                        add, 1, plane_index);
2932
2933             STOP_TIMER("add_yblock")
2934         }
2935
2936     STOP_TIMER("predict_slice")
2937 }
2938
2939 static always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2940     const int mb_h= s->b_height << s->block_max_depth;
2941     int mb_y;
2942     for(mb_y=0; mb_y<=mb_h; mb_y++)
2943         predict_slice(s, buf, plane_index, add, mb_y);
2944 }
2945
2946 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2947     int i, x2, y2;
2948     Plane *p= &s->plane[plane_index];
2949     const int block_size = MB_SIZE >> s->block_max_depth;
2950     const int block_w    = plane_index ? block_size/2 : block_size;
2951     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2952     const int obmc_stride= plane_index ? block_size : 2*block_size;
2953     const int ref_stride= s->current_picture.linesize[plane_index];
2954     uint8_t *src= s-> input_picture.data[plane_index];
2955     DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2956     const int b_stride = s->b_width << s->block_max_depth;
2957     const int w= p->width;
2958     const int h= p->height;
2959     int index= mb_x + mb_y*b_stride;
2960     BlockNode *b= &s->block[index];
2961     BlockNode backup= *b;
2962     int ab=0;
2963     int aa=0;
2964
2965     b->type|= BLOCK_INTRA;
2966     b->color[plane_index]= 0;
2967     memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2968
2969     for(i=0; i<4; i++){
2970         int mb_x2= mb_x + (i &1) - 1;
2971         int mb_y2= mb_y + (i>>1) - 1;
2972         int x= block_w*mb_x2 + block_w/2;
2973         int y= block_w*mb_y2 + block_w/2;
2974
2975         add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2976                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2977
2978         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2979             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2980                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2981                 int obmc_v= obmc[index];
2982                 int d;
2983                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2984                 if(x<0) obmc_v += obmc[index + block_w];
2985                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2986                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2987                 //FIXME precalc this or simplify it somehow else
2988
2989                 d = -dst[index] + (1<<(FRAC_BITS-1));
2990                 dst[index] = d;
2991                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2992                 aa += obmc_v * obmc_v; //FIXME precalclate this
2993             }
2994         }
2995     }
2996     *b= backup;
2997
2998     return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2999 }
3000
3001 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
3002     const int b_stride = s->b_width << s->block_max_depth;
3003     const int b_height = s->b_height<< s->block_max_depth;
3004     int index= x + y*b_stride;
3005     BlockNode *b     = &s->block[index];
3006     BlockNode *left  = x ? &s->block[index-1] : &null_block;
3007     BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
3008     BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
3009     BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
3010     int dmx, dmy;
3011 //  int mx_context= av_log2(2*ABS(left->mx - top->mx));
3012 //  int my_context= av_log2(2*ABS(left->my - top->my));
3013
3014     if(x<0 || x>=b_stride || y>=b_height)
3015         return 0;
3016     dmx= b->mx - mid_pred(left->mx, top->mx, tr->mx);
3017     dmy= b->my - mid_pred(left->my, top->my, tr->my);
3018 /*
3019 1            0      0
3020 01X          1-2    1
3021 001XX        3-6    2-3
3022 0001XXX      7-14   4-7
3023 00001XXXX   15-30   8-15
3024 */
3025 //FIXME try accurate rate
3026 //FIXME intra and inter predictors if surrounding blocks arent the same type
3027     if(b->type & BLOCK_INTRA){
3028         return 3+2*( av_log2(2*ABS(left->color[0] - b->color[0]))
3029                    + av_log2(2*ABS(left->color[1] - b->color[1]))
3030                    + av_log2(2*ABS(left->color[2] - b->color[2])));
3031     }else
3032         return 2*(1 + av_log2(2*ABS(dmx)) //FIXME kill the 2* can be merged in lambda
3033                     + av_log2(2*ABS(dmy))
3034                     + av_log2(2*b->ref));
3035 }
3036
3037 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
3038     Plane *p= &s->plane[plane_index];
3039     const int block_size = MB_SIZE >> s->block_max_depth;
3040     const int block_w    = plane_index ? block_size/2 : block_size;
3041     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3042     const int obmc_stride= plane_index ? block_size : 2*block_size;
3043     const int ref_stride= s->current_picture.linesize[plane_index];
3044     uint8_t *dst= s->current_picture.data[plane_index];
3045     uint8_t *src= s->  input_picture.data[plane_index];
3046     DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3047     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
3048     uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
3049     const int b_stride = s->b_width << s->block_max_depth;
3050     const int b_height = s->b_height<< s->block_max_depth;
3051     const int w= p->width;
3052     const int h= p->height;
3053     int distortion;
3054     int rate= 0;
3055     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3056     int sx= block_w*mb_x - block_w/2;
3057     int sy= block_w*mb_y - block_w/2;
3058     int x0= FFMAX(0,-sx);
3059     int y0= FFMAX(0,-sy);
3060     int x1= FFMIN(block_w*2, w-sx);
3061     int y1= FFMIN(block_w*2, h-sy);
3062     int i,x,y;
3063
3064     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
3065
3066     for(y=y0; y<y1; y++){
3067         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
3068         const DWTELEM *pred1 = pred + y*obmc_stride;
3069         uint8_t *cur1 = cur + y*ref_stride;
3070         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
3071         for(x=x0; x<x1; x++){
3072             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
3073             v = (v + pred1[x]) >> FRAC_BITS;
3074             if(v&(~255)) v= ~(v>>31);
3075             dst1[x] = v;
3076         }
3077     }
3078
3079     /* copy the regions where obmc[] = (uint8_t)256 */
3080     if(LOG2_OBMC_MAX == 8
3081         && (mb_x == 0 || mb_x == b_stride-1)
3082         && (mb_y == 0 || mb_y == b_height-1)){
3083         if(mb_x == 0)
3084             x1 = block_w;
3085         else
3086             x0 = block_w;
3087         if(mb_y == 0)
3088             y1 = block_w;
3089         else
3090             y0 = block_w;
3091         for(y=y0; y<y1; y++)
3092             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
3093     }
3094
3095     //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
3096     if(block_w==16){
3097         distortion = 0;
3098         for(i=0; i<4; i++){
3099             int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3100             distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3101         }
3102     }else{
3103         assert(block_w==8);
3104         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3105     }
3106
3107     if(plane_index==0){
3108         for(i=0; i<4; i++){
3109 /* ..RRr
3110  * .RXx.
3111  * rxx..
3112  */
3113             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3114         }
3115         if(mb_x == b_stride-2)
3116             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3117     }
3118     return distortion + rate*penalty_factor;
3119 }
3120
3121 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3122     int i, y2;
3123     Plane *p= &s->plane[plane_index];
3124     const int block_size = MB_SIZE >> s->block_max_depth;
3125     const int block_w    = plane_index ? block_size/2 : block_size;
3126     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3127     const int obmc_stride= plane_index ? block_size : 2*block_size;
3128     const int ref_stride= s->current_picture.linesize[plane_index];
3129     uint8_t *dst= s->current_picture.data[plane_index];
3130     uint8_t *src= s-> input_picture.data[plane_index];
3131     const static DWTELEM zero_dst[4096]; //FIXME
3132     const int b_stride = s->b_width << s->block_max_depth;
3133     const int b_height = s->b_height<< s->block_max_depth;
3134     const int w= p->width;
3135     const int h= p->height;
3136     int distortion= 0;
3137     int rate= 0;
3138     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3139
3140     for(i=0; i<9; i++){
3141         int mb_x2= mb_x + (i%3) - 1;
3142         int mb_y2= mb_y + (i/3) - 1;
3143         int x= block_w*mb_x2 + block_w/2;
3144         int y= block_w*mb_y2 + block_w/2;
3145
3146         add_yblock(s, zero_dst, dst, obmc,
3147                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3148
3149         //FIXME find a cleaner/simpler way to skip the outside stuff
3150         for(y2= y; y2<0; y2++)
3151             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3152         for(y2= h; y2<y+block_w; y2++)
3153             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3154         if(x<0){
3155             for(y2= y; y2<y+block_w; y2++)
3156                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3157         }
3158         if(x+block_w > w){
3159             for(y2= y; y2<y+block_w; y2++)
3160                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3161         }
3162
3163         assert(block_w== 8 || block_w==16);
3164         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3165     }
3166
3167     if(plane_index==0){
3168         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3169         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3170
3171 /* ..RRRr
3172  * .RXXx.
3173  * .RXXx.
3174  * rxxx.
3175  */
3176         if(merged)
3177             rate = get_block_bits(s, mb_x, mb_y, 2);
3178         for(i=merged?4:0; i<9; i++){
3179             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3180             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3181         }
3182     }
3183     return distortion + rate*penalty_factor;
3184 }
3185
3186 static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3187     const int b_stride= s->b_width << s->block_max_depth;
3188     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3189     BlockNode backup= *block;
3190     int rd, index, value;
3191
3192     assert(mb_x>=0 && mb_y>=0);
3193     assert(mb_x<b_stride);
3194
3195     if(intra){
3196         block->color[0] = p[0];
3197         block->color[1] = p[1];
3198         block->color[2] = p[2];
3199         block->type |= BLOCK_INTRA;
3200     }else{
3201         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3202         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3203         if(s->me_cache[index] == value)
3204             return 0;
3205         s->me_cache[index]= value;
3206
3207         block->mx= p[0];
3208         block->my= p[1];
3209         block->type &= ~BLOCK_INTRA;
3210     }
3211
3212     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3213
3214 //FIXME chroma
3215     if(rd < *best_rd){
3216         *best_rd= rd;
3217         return 1;
3218     }else{
3219         *block= backup;
3220         return 0;
3221     }
3222 }
3223
3224 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3225 static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3226     int p[2] = {p0, p1};
3227     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3228 }
3229
3230 static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3231     const int b_stride= s->b_width << s->block_max_depth;
3232     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3233     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3234     int rd, index, value;
3235
3236     assert(mb_x>=0 && mb_y>=0);
3237     assert(mb_x<b_stride);
3238     assert(((mb_x|mb_y)&1) == 0);
3239
3240     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3241     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3242     if(s->me_cache[index] == value)
3243         return 0;
3244     s->me_cache[index]= value;
3245
3246     block->mx= p0;
3247     block->my= p1;
3248     block->ref= ref;
3249     block->type &= ~BLOCK_INTRA;
3250     block[1]= block[b_stride]= block[b_stride+1]= *block;
3251
3252     rd= get_4block_rd(s, mb_x, mb_y, 0);
3253
3254 //FIXME chroma
3255     if(rd < *best_rd){
3256         *best_rd= rd;
3257         return 1;
3258     }else{
3259         block[0]= backup[0];
3260         block[1]= backup[1];
3261         block[b_stride]= backup[2];
3262         block[b_stride+1]= backup[3];
3263         return 0;
3264     }
3265 }
3266
3267 static void iterative_me(SnowContext *s){
3268     int pass, mb_x, mb_y;
3269     const int b_width = s->b_width  << s->block_max_depth;
3270     const int b_height= s->b_height << s->block_max_depth;
3271     const int b_stride= b_width;
3272     int color[3];
3273
3274     {
3275         RangeCoder r = s->c;
3276         uint8_t state[sizeof(s->block_state)];
3277         memcpy(state, s->block_state, sizeof(s->block_state));
3278         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3279             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3280                 encode_q_branch(s, 0, mb_x, mb_y);
3281         s->c = r;
3282         memcpy(s->block_state, state, sizeof(s->block_state));
3283     }
3284
3285     for(pass=0; pass<50; pass++){
3286         int change= 0;
3287
3288         for(mb_y= 0; mb_y<b_height; mb_y++){
3289             for(mb_x= 0; mb_x<b_width; mb_x++){
3290                 int dia_change, i, j, ref;
3291                 int best_rd= INT_MAX, ref_rd;
3292                 BlockNode backup, ref_b;
3293                 const int index= mb_x + mb_y * b_stride;
3294                 BlockNode *block= &s->block[index];
3295                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : &null_block;
3296                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : &null_block;
3297                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : &null_block;
3298                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : &null_block;
3299                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : &null_block;
3300                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : &null_block;
3301                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : &null_block;
3302                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : &null_block;
3303                 const int b_w= (MB_SIZE >> s->block_max_depth);
3304                 uint8_t obmc_edged[b_w*2][b_w*2];
3305
3306                 if(pass && (block->type & BLOCK_OPT))
3307                     continue;
3308                 block->type |= BLOCK_OPT;
3309
3310                 backup= *block;
3311
3312                 if(!s->me_cache_generation)
3313                     memset(s->me_cache, 0, sizeof(s->me_cache));
3314                 s->me_cache_generation += 1<<22;
3315
3316                 //FIXME precalc
3317                 {
3318                     int x, y;
3319                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3320                     if(mb_x==0)
3321                         for(y=0; y<b_w*2; y++)
3322                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3323                     if(mb_x==b_stride-1)
3324                         for(y=0; y<b_w*2; y++)
3325                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3326                     if(mb_y==0){
3327                         for(x=0; x<b_w*2; x++)
3328                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3329                         for(y=1; y<b_w; y++)
3330                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3331                     }
3332                     if(mb_y==b_height-1){
3333                         for(x=0; x<b_w*2; x++)
3334                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3335                         for(y=b_w; y<b_w*2-1; y++)
3336                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3337                     }
3338                 }
3339
3340                 //skip stuff outside the picture
3341                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3342                 {
3343                     uint8_t *src= s->  input_picture.data[0];
3344                     uint8_t *dst= s->current_picture.data[0];
3345                     const int stride= s->current_picture.linesize[0];
3346                     const int block_w= MB_SIZE >> s->block_max_depth;
3347                     const int sx= block_w*mb_x - block_w/2;
3348                     const int sy= block_w*mb_y - block_w/2;
3349                     const int w= s->plane[0].width;
3350                     const int h= s->plane[0].height;
3351                     int y;
3352
3353                     for(y=sy; y<0; y++)
3354                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3355                     for(y=h; y<sy+block_w*2; y++)
3356                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3357                     if(sx<0){
3358                         for(y=sy; y<sy+block_w*2; y++)
3359                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3360                     }
3361                     if(sx+block_w*2 > w){
3362                         for(y=sy; y<sy+block_w*2; y++)
3363                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3364                     }
3365                 }
3366
3367                 // intra(black) = neighbors' contribution to the current block
3368                 for(i=0; i<3; i++)
3369                     color[i]= get_dc(s, mb_x, mb_y, i);
3370
3371                 // get previous score (cant be cached due to OBMC)
3372                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3373                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3374                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3375                 }else
3376                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3377
3378                 ref_b= *block;
3379                 ref_rd= best_rd;
3380                 for(ref=0; ref < s->ref_frames; ref++){
3381                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3382                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3383                         continue;
3384                     block->ref= ref;
3385                     best_rd= INT_MAX;
3386
3387                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3388                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3389                     if(tb!=&null_block)
3390                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3391                     if(lb!=&null_block)
3392                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3393                     if(rb!=&null_block)
3394                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3395                     if(bb!=&null_block)
3396                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3397
3398                     /* fullpel ME */
3399                     //FIXME avoid subpel interpol / round to nearest integer
3400                     do{
3401                         dia_change=0;
3402                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3403                             for(j=0; j<i; j++){
3404                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3405                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3406                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3407                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3408                             }
3409                         }
3410                     }while(dia_change);
3411                     /* subpel ME */
3412                     do{
3413                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3414                         dia_change=0;
3415                         for(i=0; i<8; i++)
3416                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3417                     }while(dia_change);
3418                     //FIXME or try the standard 2 pass qpel or similar
3419
3420                     mvr[0][0]= block->mx;
3421                     mvr[0][1]= block->my;
3422                     if(ref_rd > best_rd){
3423                         ref_rd= best_rd;
3424                         ref_b= *block;
3425                     }
3426                 }
3427                 best_rd= ref_rd;
3428                 *block= ref_b;
3429 #if 1
3430                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3431                 //FIXME RD style color selection
3432 #endif
3433                 if(!same_block(block, &backup)){
3434                     if(tb != &null_block) tb ->type &= ~BLOCK_OPT;
3435                     if(lb != &null_block) lb ->type &= ~BLOCK_OPT;
3436                     if(rb != &null_block) rb ->type &= ~BLOCK_OPT;
3437                     if(bb != &null_block) bb ->type &= ~BLOCK_OPT;
3438                     if(tlb!= &null_block) tlb->type &= ~BLOCK_OPT;
3439                     if(trb!= &null_block) trb->type &= ~BLOCK_OPT;
3440                     if(blb!= &null_block) blb->type &= ~BLOCK_OPT;
3441                     if(brb!= &null_block) brb->type &= ~BLOCK_OPT;
3442                     change ++;
3443                 }
3444             }
3445         }
3446         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3447         if(!change)
3448             break;
3449     }
3450
3451     if(s->block_max_depth == 1){
3452         int change= 0;
3453         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3454             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3455                 int dia_change, i, j;
3456                 int best_rd, init_rd;
3457                 const int index= mb_x + mb_y * b_stride;
3458                 BlockNode *b[4];
3459
3460                 b[0]= &s->block[index];
3461                 b[1]= b[0]+1;
3462                 b[2]= b[0]+b_stride;
3463                 b[3]= b[2]+1;
3464                 if(same_block(b[0], b[1]) &&
3465                    same_block(b[0], b[2]) &&
3466                    same_block(b[0], b[3]))
3467                     continue;
3468
3469                 if(!s->me_cache_generation)
3470                     memset(s->me_cache, 0, sizeof(s->me_cache));
3471                 s->me_cache_generation += 1<<22;
3472
3473                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3474
3475                 //FIXME more multiref search?
3476                 check_4block_inter(s, mb_x, mb_y,
3477                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3478                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3479
3480                 for(i=0; i<4; i++)
3481                     if(!(b[i]->type&BLOCK_INTRA))
3482                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3483
3484                 if(init_rd != best_rd)
3485                     change++;
3486             }
3487         }
3488         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3489     }
3490 }
3491
3492 static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3493     const int level= b->level;
3494     const int w= b->width;
3495     const int h= b->height;
3496     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3497     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3498     int x,y, thres1, thres2;
3499 //    START_TIMER
3500
3501     if(s->qlog == LOSSLESS_QLOG) return;
3502
3503     bias= bias ? 0 : (3*qmul)>>3;
3504     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3505     thres2= 2*thres1;
3506
3507     if(!bias){
3508         for(y=0; y<h; y++){
3509             for(x=0; x<w; x++){
3510                 int i= src[x + y*stride];
3511
3512                 if((unsigned)(i+thres1) > thres2){
3513                     if(i>=0){
3514                         i<<= QEXPSHIFT;
3515                         i/= qmul; //FIXME optimize
3516                         src[x + y*stride]=  i;
3517                     }else{
3518                         i= -i;
3519                         i<<= QEXPSHIFT;
3520                         i/= qmul; //FIXME optimize
3521                         src[x + y*stride]= -i;
3522                     }
3523                 }else
3524                     src[x + y*stride]= 0;
3525             }
3526         }
3527     }else{
3528         for(y=0; y<h; y++){
3529             for(x=0; x<w; x++){
3530                 int i= src[x + y*stride];
3531
3532                 if((unsigned)(i+thres1) > thres2){
3533                     if(i>=0){
3534                         i<<= QEXPSHIFT;
3535                         i= (i + bias) / qmul; //FIXME optimize
3536                         src[x + y*stride]=  i;
3537                     }else{
3538                         i= -i;
3539                         i<<= QEXPSHIFT;
3540                         i= (i + bias) / qmul; //FIXME optimize
3541                         src[x + y*stride]= -i;
3542                     }
3543                 }else
3544                     src[x + y*stride]= 0;
3545             }
3546         }
3547     }
3548     if(level+1 == s->spatial_decomposition_count){
3549 //        STOP_TIMER("quantize")
3550     }
3551 }
3552
3553 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3554     const int w= b->width;
3555     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3556     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3557     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3558     int x,y;
3559     START_TIMER
3560
3561     if(s->qlog == LOSSLESS_QLOG) return;
3562
3563     for(y=start_y; y<end_y; y++){
3564 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3565         DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3566         for(x=0; x<w; x++){
3567             int i= line[x];
3568             if(i<0){
3569                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3570             }else if(i>0){
3571                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3572             }
3573         }
3574     }
3575     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3576         STOP_TIMER("dquant")
3577     }
3578 }
3579
3580 static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3581     const int w= b->width;
3582     const int h= b->height;
3583     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3584     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3585     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3586     int x,y;
3587     START_TIMER
3588
3589     if(s->qlog == LOSSLESS_QLOG) return;
3590
3591     for(y=0; y<h; y++){
3592         for(x=0; x<w; x++){
3593             int i= src[x + y*stride];
3594             if(i<0){
3595                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3596             }else if(i>0){
3597                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3598             }
3599         }
3600     }
3601     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3602         STOP_TIMER("dquant")
3603     }
3604 }
3605
3606 static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3607     const int w= b->width;
3608     const int h= b->height;
3609     int x,y;
3610
3611     for(y=h-1; y>=0; y--){
3612         for(x=w-1; x>=0; x--){
3613             int i= x + y*stride;
3614
3615             if(x){
3616                 if(use_median){
3617                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3618                     else  src[i] -= src[i - 1];
3619                 }else{
3620                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3621                     else  src[i] -= src[i - 1];
3622                 }
3623             }else{
3624                 if(y) src[i] -= src[i - stride];
3625             }
3626         }
3627     }
3628 }
3629
3630 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3631     const int w= b->width;
3632     int x,y;
3633
3634 //    START_TIMER
3635
3636     DWTELEM * line;
3637     DWTELEM * prev;
3638
3639     if (start_y != 0)
3640         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3641
3642     for(y=start_y; y<end_y; y++){
3643         prev = line;
3644 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3645         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3646         for(x=0; x<w; x++){
3647             if(x){
3648                 if(use_median){
3649                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3650                     else  line[x] += line[x - 1];
3651                 }else{
3652                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3653                     else  line[x] += line[x - 1];
3654                 }
3655             }else{
3656                 if(y) line[x] += prev[x];
3657             }
3658         }
3659     }
3660
3661 //    STOP_TIMER("correlate")
3662 }
3663
3664 static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3665     const int w= b->width;
3666     const int h= b->height;
3667     int x,y;
3668
3669     for(y=0; y<h; y++){
3670         for(x=0; x<w; x++){
3671             int i= x + y*stride;
3672
3673             if(x){
3674                 if(use_median){
3675                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3676                     else  src[i] += src[i - 1];
3677                 }else{
3678                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3679                     else  src[i] += src[i - 1];
3680                 }
3681             }else{
3682                 if(y) src[i] += src[i - stride];
3683             }
3684         }
3685     }
3686 }
3687
3688 static void encode_header(SnowContext *s){
3689     int plane_index, level, orientation;
3690     uint8_t kstate[32];
3691
3692     memset(kstate, MID_STATE, sizeof(kstate));
3693
3694     put_rac(&s->c, kstate, s->keyframe);
3695     if(s->keyframe || s->always_reset)
3696         reset_contexts(s);
3697     if(s->keyframe){
3698         put_symbol(&s->c, s->header_state, s->version, 0);
3699         put_rac(&s->c, s->header_state, s->always_reset);
3700         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3701         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3702         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3703         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3704         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3705         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3706         put_rac(&s->c, s->header_state, s->spatial_scalability);
3707 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3708         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3709
3710         for(plane_index=0; plane_index<2; plane_index++){
3711             for(level=0; level<s->spatial_decomposition_count; level++){
3712                 for(orientation=level ? 1:0; orientation<4; orientation++){
3713                     if(orientation==2) continue;
3714                     put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3715                 }
3716             }
3717         }
3718     }
3719     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0);
3720     put_symbol(&s->c, s->header_state, s->qlog, 1);
3721     put_symbol(&s->c, s->header_state, s->mv_scale, 0);
3722     put_symbol(&s->c, s->header_state, s->qbias, 1);
3723     put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
3724 }
3725
3726 static int decode_header(SnowContext *s){
3727     int plane_index, level, orientation;
3728     uint8_t kstate[32];
3729
3730     memset(kstate, MID_STATE, sizeof(kstate));
3731
3732     s->keyframe= get_rac(&s->c, kstate);
3733     if(s->keyframe || s->always_reset)
3734         reset_contexts(s);
3735     if(s->keyframe){
3736         s->version= get_symbol(&s->c, s->header_state, 0);
3737         if(s->version>0){
3738             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3739             return -1;
3740         }
3741         s->always_reset= get_rac(&s->c, s->header_state);
3742         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3743         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3744         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3745         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3746         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3747         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3748         s->spatial_scalability= get_rac(&s->c, s->header_state);
3749 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3750         s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3751
3752         for(plane_index=0; plane_index<3; plane_index++){
3753             for(level=0; level<s->spatial_decomposition_count; level++){
3754                 for(orientation=level ? 1:0; orientation<4; orientation++){
3755                     int q;
3756                     if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3757                     else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3758                     else                    q= get_symbol(&s->c, s->header_state, 1);
3759                     s->plane[plane_index].band[level][orientation].qlog= q;
3760                 }
3761             }
3762         }
3763     }
3764
3765     s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3766     if(s->spatial_decomposition_type > 2){
3767         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3768         return -1;
3769     }
3770
3771     s->qlog= get_symbol(&s->c, s->header_state, 1);
3772     s->mv_scale= get_symbol(&s->c, s->header_state, 0);
3773     s->qbias= get_symbol(&s->c, s->header_state, 1);
3774     s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
3775     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3776         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3777         s->block_max_depth= 0;
3778         return -1;
3779     }
3780
3781     return 0;
3782 }
3783
3784 static void init_qexp(void){
3785     int i;
3786     double v=128;
3787
3788     for(i=0; i<QROOT; i++){
3789         qexp[i]= lrintf(v);
3790         v *= pow(2, 1.0 / QROOT);
3791     }
3792 }
3793
3794 static int common_init(AVCodecContext *avctx){
3795     SnowContext *s = avctx->priv_data;
3796     int width, height;
3797     int level, orientation, plane_index, dec;
3798
3799     s->avctx= avctx;
3800
3801     dsputil_init(&s->dsp, avctx);
3802
3803 #define mcf(dx,dy)\
3804     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3805     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3806         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3807     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3808     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3809         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3810
3811     mcf( 0, 0)
3812     mcf( 4, 0)
3813     mcf( 8, 0)
3814     mcf(12, 0)
3815     mcf( 0, 4)
3816     mcf( 4, 4)
3817     mcf( 8, 4)
3818     mcf(12, 4)
3819     mcf( 0, 8)
3820     mcf( 4, 8)
3821     mcf( 8, 8)
3822     mcf(12, 8)
3823     mcf( 0,12)
3824     mcf( 4,12)
3825     mcf( 8,12)
3826     mcf(12,12)
3827
3828 #define mcfh(dx,dy)\
3829     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3830     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3831         mc_block_hpel ## dx ## dy ## 16;\
3832     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3833     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3834         mc_block_hpel ## dx ## dy ## 8;
3835
3836     mcfh(0, 0)
3837     mcfh(8, 0)
3838     mcfh(0, 8)
3839     mcfh(8, 8)
3840
3841     if(!qexp[0])
3842         init_qexp();
3843
3844     dec= s->spatial_decomposition_count= 5;
3845     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3846
3847     s->chroma_h_shift= 1; //FIXME XXX
3848     s->chroma_v_shift= 1;
3849
3850 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3851
3852     width= s->avctx->width;
3853     height= s->avctx->height;
3854
3855     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3856
3857     s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3858     s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3859
3860     for(plane_index=0; plane_index<3; plane_index++){
3861         int w= s->avctx->width;
3862         int h= s->avctx->height;
3863
3864         if(plane_index){
3865             w>>= s->chroma_h_shift;
3866             h>>= s->chroma_v_shift;
3867         }
3868         s->plane[plane_index].width = w;
3869         s->plane[plane_index].height= h;
3870 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3871         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3872             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3873                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3874
3875                 b->buf= s->spatial_dwt_buffer;
3876                 b->level= level;
3877                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3878                 b->width = (w + !(orientation&1))>>1;
3879                 b->height= (h + !(orientation>1))>>1;
3880
3881                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3882                 b->buf_x_offset = 0;
3883                 b->buf_y_offset = 0;
3884
3885                 if(orientation&1){
3886                     b->buf += (w+1)>>1;
3887                     b->buf_x_offset = (w+1)>>1;
3888                 }
3889                 if(orientation>1){
3890                     b->buf += b->stride>>1;
3891                     b->buf_y_offset = b->stride_line >> 1;
3892                 }
3893
3894                 if(level)
3895                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3896                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3897             }
3898             w= (w+1)>>1;
3899             h= (h+1)>>1;
3900         }
3901     }
3902
3903     reset_contexts(s);
3904 /*
3905     width= s->width= avctx->width;
3906     height= s->height= avctx->height;
3907
3908     assert(width && height);
3909 */
3910     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3911
3912     return 0;
3913 }
3914
3915 static void ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3916 {
3917     /* estimate the frame's complexity as a sum of weighted dwt coefs.
3918      * FIXME we know exact mv bits at this point,
3919      * but ratecontrol isn't set up to include them. */
3920     uint32_t coef_sum= 0;
3921     int level, orientation;
3922
3923     for(level=0; level<s->spatial_decomposition_count; level++){
3924         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3925             SubBand *b= &s->plane[0].band[level][orientation];
3926             DWTELEM *buf= b->buf;
3927             const int w= b->width;
3928             const int h= b->height;
3929             const int stride= b->stride;
3930             const int qlog= clip(2*QROOT + b->qlog, 0, QROOT*16);
3931             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3932             const int qdiv= (1<<16)/qmul;
3933             int x, y;
3934             if(orientation==0)
3935                 decorrelate(s, b, buf, stride, 1, 0);
3936             for(y=0; y<h; y++)
3937                 for(x=0; x<w; x++)
3938                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3939             if(orientation==0)
3940                 correlate(s, b, buf, stride, 1, 0);
3941         }
3942     }
3943
3944     /* ugly, ratecontrol just takes a sqrt again */
3945     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3946     assert(coef_sum < INT_MAX);
3947
3948     if(pict->pict_type == I_TYPE){
3949         s->m.current_picture.mb_var_sum= coef_sum;
3950         s->m.current_picture.mc_mb_var_sum= 0;
3951     }else{
3952         s->m.current_picture.mc_mb_var_sum= coef_sum;
3953         s->m.current_picture.mb_var_sum= 0;
3954     }
3955
3956     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3957     s->lambda= pict->quality * 3/2;
3958 }
3959
3960 static void calculate_vissual_weight(SnowContext *s, Plane *p){
3961     int width = p->width;
3962     int height= p->height;
3963     int level, orientation, x, y;
3964
3965     for(level=0; level<s->spatial_decomposition_count; level++){
3966         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3967             SubBand *b= &p->band[level][orientation];
3968             DWTELEM *buf= b->buf;
3969             int64_t error=0;
3970
3971             memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3972             buf[b->width/2 + b->height/2*b->stride]= 256*256;
3973             ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3974             for(y=0; y<height; y++){
3975                 for(x=0; x<width; x++){
3976                     int64_t d= s->spatial_dwt_buffer[x + y*width];
3977                     error += d*d;
3978                 }
3979             }
3980
3981             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3982 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3983         }
3984     }
3985 }
3986
3987 static int encode_init(AVCodecContext *avctx)
3988 {
3989     SnowContext *s = avctx->priv_data;
3990     int plane_index;
3991
3992     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3993         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3994                "use vstrict=-2 / -strict -2 to use it anyway\n");
3995         return -1;
3996     }
3997
3998     common_init(avctx);
3999     alloc_blocks(s);
4000
4001     s->version=0;
4002
4003     s->m.avctx   = avctx;
4004     s->m.flags   = avctx->flags;
4005     s->m.bit_rate= avctx->bit_rate;
4006
4007     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
4008     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4009     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4010     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
4011     h263_encode_init(&s->m); //mv_penalty
4012
4013     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
4014
4015     if(avctx->flags&CODEC_FLAG_PASS1){
4016         if(!avctx->stats_out)
4017             avctx->stats_out = av_mallocz(256);
4018     }
4019     if(!(avctx->flags&CODEC_FLAG_QSCALE)){
4020         if(ff_rate_control_init(&s->m) < 0)
4021             return -1;
4022     }
4023     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
4024
4025     for(plane_index=0; plane_index<3; plane_index++){
4026         calculate_vissual_weight(s, &s->plane[plane_index]);
4027     }
4028
4029
4030     avctx->coded_frame= &s->current_picture;
4031     switch(avctx->pix_fmt){
4032 //    case PIX_FMT_YUV444P:
4033 //    case PIX_FMT_YUV422P:
4034     case PIX_FMT_YUV420P:
4035     case PIX_FMT_GRAY8:
4036 //    case PIX_FMT_YUV411P:
4037 //    case PIX_FMT_YUV410P:
4038         s->colorspace_type= 0;
4039         break;
4040 /*    case PIX_FMT_RGBA32:
4041         s->colorspace= 1;
4042         break;*/
4043     default:
4044         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
4045         return -1;
4046     }
4047 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4048     s->chroma_h_shift= 1;
4049     s->chroma_v_shift= 1;
4050
4051     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4052     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4053
4054     s->avctx->get_buffer(s->avctx, &s->input_picture);
4055
4056     if(s->avctx->me_method == ME_ITER){
4057         int i;
4058         int size= s->b_width * s->b_height << 2*s->block_max_depth;
4059         for(i=0; i<s->max_ref_frames; i++){
4060             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4061             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4062         }
4063     }
4064
4065     return 0;
4066 }
4067
4068 static int frame_start(SnowContext *s){
4069    AVFrame tmp;
4070    int w= s->avctx->width; //FIXME round up to x16 ?
4071    int h= s->avctx->height;
4072
4073     if(s->current_picture.data[0]){
4074         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4075         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4076         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4077     }
4078
4079     tmp= s->last_picture[s->max_ref_frames-1];
4080     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4081     s->last_picture[0]= s->current_picture;
4082     s->current_picture= tmp;
4083
4084     if(s->keyframe){
4085         s->ref_frames= 0;
4086     }else{
4087         int i;
4088         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4089             if(i && s->last_picture[i-1].key_frame)
4090                 break;
4091         s->ref_frames= i;
4092     }
4093
4094     s->current_picture.reference= 1;
4095     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4096         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4097         return -1;
4098     }
4099
4100     s->current_picture.key_frame= s->keyframe;
4101
4102     return 0;
4103 }
4104
4105 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4106     SnowContext *s = avctx->priv_data;
4107     RangeCoder * const c= &s->c;
4108     AVFrame *pict = data;
4109     const int width= s->avctx->width;
4110     const int height= s->avctx->height;
4111     int level, orientation, plane_index, i, y;
4112
4113     ff_init_range_encoder(c, buf, buf_size);
4114     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4115
4116     for(i=0; i<3; i++){
4117         int shift= !!i;
4118         for(y=0; y<(height>>shift); y++)
4119             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4120                    &pict->data[i][y * pict->linesize[i]],
4121                    width>>shift);
4122     }
4123     s->new_picture = *pict;
4124
4125     s->m.picture_number= avctx->frame_number;
4126     if(avctx->flags&CODEC_FLAG_PASS2){
4127         s->m.pict_type =
4128         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4129         s->keyframe= pict->pict_type==FF_I_TYPE;
4130         if(!(avctx->flags&CODEC_FLAG_QSCALE))
4131             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4132     }else{
4133         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4134         s->m.pict_type=
4135         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4136     }
4137
4138     if(s->pass1_rc && avctx->frame_number == 0)
4139         pict->quality= 2*FF_QP2LAMBDA;
4140     if(pict->quality){
4141         s->qlog= rint(QROOT*log(pict->quality / (float)FF_QP2LAMBDA)/log(2));
4142         //<64 >60
4143         s->qlog += 61*QROOT/8;
4144         s->lambda = pict->quality * 3/2;
4145     }
4146     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
4147         s->qlog= LOSSLESS_QLOG;
4148         s->lambda = 0;
4149     }//else keep previous frame's qlog until after motion est
4150
4151     frame_start(s);
4152
4153     s->m.current_picture_ptr= &s->m.current_picture;
4154     if(pict->pict_type == P_TYPE){
4155         int block_width = (width +15)>>4;
4156         int block_height= (height+15)>>4;
4157         int stride= s->current_picture.linesize[0];
4158
4159         assert(s->current_picture.data[0]);
4160         assert(s->last_picture[0].data[0]);
4161
4162         s->m.avctx= s->avctx;
4163         s->m.current_picture.data[0]= s->current_picture.data[0];
4164         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
4165         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4166         s->m.   last_picture_ptr= &s->m.   last_picture;
4167         s->m.linesize=
4168         s->m.   last_picture.linesize[0]=
4169         s->m.    new_picture.linesize[0]=
4170         s->m.current_picture.linesize[0]= stride;
4171         s->m.uvlinesize= s->current_picture.linesize[1];
4172         s->m.width = width;
4173         s->m.height= height;
4174         s->m.mb_width = block_width;
4175         s->m.mb_height= block_height;
4176         s->m.mb_stride=   s->m.mb_width+1;
4177         s->m.b8_stride= 2*s->m.mb_width+1;
4178         s->m.f_code=1;
4179         s->m.pict_type= pict->pict_type;
4180         s->m.me_method= s->avctx->me_method;
4181         s->m.me.scene_change_score=0;
4182         s->m.flags= s->avctx->flags;
4183         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4184         s->m.out_format= FMT_H263;
4185         s->m.unrestricted_mv= 1;
4186
4187         s->m.lambda = s->lambda;
4188         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4189         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4190
4191         s->m.dsp= s->dsp; //move
4192         ff_init_me(&s->m);
4193         s->dsp= s->m.dsp;
4194     }
4195
4196 redo_frame:
4197
4198     s->m.pict_type = pict->pict_type;
4199     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
4200
4201     encode_header(s);
4202     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4203     encode_blocks(s);
4204     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4205
4206     for(plane_index=0; plane_index<3; plane_index++){
4207         Plane *p= &s->plane[plane_index];
4208         int w= p->width;
4209         int h= p->height;
4210         int x, y;
4211 //        int bits= put_bits_count(&s->c.pb);
4212
4213         //FIXME optimize
4214      if(pict->data[plane_index]) //FIXME gray hack
4215         for(y=0; y<h; y++){
4216             for(x=0; x<w; x++){
4217                 s->spatial_dwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4218             }
4219         }
4220         predict_plane(s, s->spatial_dwt_buffer, plane_index, 0);
4221
4222         if(   plane_index==0
4223            && pict->pict_type == P_TYPE
4224            && !(avctx->flags&CODEC_FLAG_PASS2)
4225            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4226             ff_init_range_encoder(c, buf, buf_size);
4227             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4228             pict->pict_type= FF_I_TYPE;
4229             s->keyframe=1;
4230             s->current_picture.key_frame=1;
4231             reset_contexts(s);
4232             goto redo_frame;
4233         }
4234
4235         if(s->qlog == LOSSLESS_QLOG){
4236             for(y=0; y<h; y++){
4237                 for(x=0; x<w; x++){
4238                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_dwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4239                 }
4240             }
4241         }
4242
4243         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4244
4245         if(s->pass1_rc && plane_index==0)
4246             ratecontrol_1pass(s, pict);
4247
4248         for(level=0; level<s->spatial_decomposition_count; level++){
4249             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4250                 SubBand *b= &p->band[level][orientation];
4251
4252                 quantize(s, b, b->buf, b->stride, s->qbias);
4253                 if(orientation==0)
4254                     decorrelate(s, b, b->buf, b->stride, pict->pict_type == P_TYPE, 0);
4255                 encode_subband(s, b, b->buf, b->parent ? b->parent->buf : NULL, b->stride, orientation);
4256                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
4257                 if(orientation==0)
4258                     correlate(s, b, b->buf, b->stride, 1, 0);
4259             }
4260         }
4261 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
4262
4263         for(level=0; level<s->spatial_decomposition_count; level++){
4264             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4265                 SubBand *b= &p->band[level][orientation];
4266
4267                 dequantize(s, b, b->buf, b->stride);
4268             }
4269         }
4270
4271         ff_spatial_idwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4272         if(s->qlog == LOSSLESS_QLOG){
4273             for(y=0; y<h; y++){
4274                 for(x=0; x<w; x++){
4275                     s->spatial_dwt_buffer[y*w + x]<<=FRAC_BITS;
4276                 }
4277             }
4278         }
4279 {START_TIMER
4280         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4281 STOP_TIMER("pred-conv")}
4282         if(s->avctx->flags&CODEC_FLAG_PSNR){
4283             int64_t error= 0;
4284
4285     if(pict->data[plane_index]) //FIXME gray hack
4286             for(y=0; y<h; y++){
4287                 for(x=0; x<w; x++){
4288                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4289                     error += d*d;
4290                 }
4291             }
4292             s->avctx->error[plane_index] += error;
4293             s->current_picture.error[plane_index] = error;
4294         }
4295     }
4296
4297     if(s->last_picture[s->max_ref_frames-1].data[0])
4298         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4299
4300     s->current_picture.coded_picture_number = avctx->frame_number;
4301     s->current_picture.pict_type = pict->pict_type;
4302     s->current_picture.quality = pict->quality;
4303     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4304     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4305     s->m.current_picture.display_picture_number =
4306     s->m.current_picture.coded_picture_number = avctx->frame_number;
4307     s->m.current_picture.quality = pict->quality;
4308     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4309     if(s->pass1_rc)
4310         ff_rate_estimate_qscale(&s->m, 0);
4311     if(avctx->flags&CODEC_FLAG_PASS1)
4312         ff_write_pass1_stats(&s->m);
4313     s->m.last_pict_type = s->m.pict_type;
4314
4315     emms_c();
4316
4317     return ff_rac_terminate(c);
4318 }
4319
4320 static void common_end(SnowContext *s){
4321     int plane_index, level, orientation, i;
4322
4323     av_freep(&s->spatial_dwt_buffer);
4324
4325     av_freep(&s->m.me.scratchpad);
4326     av_freep(&s->m.me.map);
4327     av_freep(&s->m.me.score_map);
4328     av_freep(&s->m.obmc_scratchpad);
4329
4330     av_freep(&s->block);
4331
4332     for(i=0; i<MAX_REF_FRAMES; i++){
4333         av_freep(&s->ref_mvs[i]);
4334         av_freep(&s->ref_scores[i]);
4335         if(s->last_picture[i].data[0])
4336             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4337     }
4338
4339     for(plane_index=0; plane_index<3; plane_index++){
4340         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4341             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4342                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4343
4344                 av_freep(&b->x_coeff);
4345             }
4346         }
4347     }
4348 }
4349
4350 static int encode_end(AVCodecContext *avctx)
4351 {
4352     SnowContext *s = avctx->priv_data;
4353
4354     common_end(s);
4355     av_free(avctx->stats_out);
4356
4357     return 0;
4358 }
4359
4360 static int decode_init(AVCodecContext *avctx)
4361 {
4362     SnowContext *s = avctx->priv_data;
4363     int block_size;
4364
4365     avctx->pix_fmt= PIX_FMT_YUV420P;
4366
4367     common_init(avctx);
4368
4369     block_size = MB_SIZE >> s->block_max_depth;
4370     slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 3)) + 1, s->plane[0].width, s->spatial_dwt_buffer);
4371
4372     return 0;
4373 }
4374
4375 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4376     SnowContext *s = avctx->priv_data;
4377     RangeCoder * const c= &s->c;
4378     int bytes_read;
4379     AVFrame *picture = data;
4380     int level, orientation, plane_index;
4381
4382     ff_init_range_decoder(c, buf, buf_size);
4383     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4384
4385     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4386     decode_header(s);
4387     if(!s->block) alloc_blocks(s);
4388
4389     frame_start(s);
4390     //keyframe flag dupliaction mess FIXME
4391     if(avctx->debug&FF_DEBUG_PICT_INFO)
4392         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4393
4394     decode_blocks(s);
4395
4396     for(plane_index=0; plane_index<3; plane_index++){
4397         Plane *p= &s->plane[plane_index];
4398         int w= p->width;
4399         int h= p->height;
4400         int x, y;
4401         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4402
4403 if(s->avctx->debug&2048){
4404         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4405         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4406
4407         for(y=0; y<h; y++){
4408             for(x=0; x<w; x++){
4409                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4410                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4411             }
4412         }
4413 }
4414
4415 {   START_TIMER
4416     for(level=0; level<s->spatial_decomposition_count; level++){
4417         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4418             SubBand *b= &p->band[level][orientation];
4419             unpack_coeffs(s, b, b->parent, orientation);
4420         }
4421     }
4422     STOP_TIMER("unpack coeffs");
4423 }
4424
4425 {START_TIMER
4426     const int mb_h= s->b_height << s->block_max_depth;
4427     const int block_size = MB_SIZE >> s->block_max_depth;
4428     const int block_w    = plane_index ? block_size/2 : block_size;
4429     int mb_y;
4430     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4431     int yd=0, yq=0;
4432     int y;
4433     int end_y;
4434
4435     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4436     for(mb_y=0; mb_y<=mb_h; mb_y++){
4437
4438         int slice_starty = block_w*mb_y;
4439         int slice_h = block_w*(mb_y+1);
4440         if (!(s->keyframe || s->avctx->debug&512)){
4441             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4442             slice_h -= (block_w >> 1);
4443         }
4444
4445         {
4446         START_TIMER
4447         for(level=0; level<s->spatial_decomposition_count; level++){
4448             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4449                 SubBand *b= &p->band[level][orientation];
4450                 int start_y;
4451                 int end_y;
4452                 int our_mb_start = mb_y;
4453                 int our_mb_end = (mb_y + 1);
4454                 const int extra= 3;
4455                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4456                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4457                 if (!(s->keyframe || s->avctx->debug&512)){
4458                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4459                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4460                 }
4461                 start_y = FFMIN(b->height, start_y);
4462                 end_y = FFMIN(b->height, end_y);
4463
4464                 if (start_y != end_y){
4465                     if (orientation == 0){
4466                         SubBand * correlate_band = &p->band[0][0];
4467                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4468                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4469                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4470                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4471                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, start_y, end_y);
4472                     }
4473                     else
4474                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4475                 }
4476             }
4477         }
4478         STOP_TIMER("decode_subband_slice");
4479         }
4480
4481 {   START_TIMER
4482         for(; yd<slice_h; yd+=4){
4483             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4484         }
4485     STOP_TIMER("idwt slice");}
4486
4487
4488         if(s->qlog == LOSSLESS_QLOG){
4489             for(; yq<slice_h && yq<h; yq++){
4490                 DWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4491                 for(x=0; x<w; x++){
4492                     line[x] <<= FRAC_BITS;
4493                 }
4494             }
4495         }
4496
4497         predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
4498
4499         y = FFMIN(p->height, slice_starty);
4500         end_y = FFMIN(p->height, slice_h);
4501         while(y < end_y)
4502             slice_buffer_release(&s->sb, y++);
4503     }
4504
4505     slice_buffer_flush(&s->sb);
4506
4507 STOP_TIMER("idwt + predict_slices")}
4508     }
4509
4510     emms_c();
4511
4512     if(s->last_picture[s->max_ref_frames-1].data[0])
4513         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4514
4515 if(!(s->avctx->debug&2048))
4516     *picture= s->current_picture;
4517 else
4518     *picture= s->mconly_picture;
4519
4520     *data_size = sizeof(AVFrame);
4521
4522     bytes_read= c->bytestream - c->bytestream_start;
4523     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4524
4525     return bytes_read;
4526 }
4527
4528 static int decode_end(AVCodecContext *avctx)
4529 {
4530     SnowContext *s = avctx->priv_data;
4531
4532     slice_buffer_destroy(&s->sb);
4533
4534     common_end(s);
4535
4536     return 0;
4537 }
4538
4539 AVCodec snow_decoder = {
4540     "snow",
4541     CODEC_TYPE_VIDEO,
4542     CODEC_ID_SNOW,
4543     sizeof(SnowContext),
4544     decode_init,
4545     NULL,
4546     decode_end,
4547     decode_frame,
4548     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4549     NULL
4550 };
4551
4552 #ifdef CONFIG_ENCODERS
4553 AVCodec snow_encoder = {
4554     "snow",
4555     CODEC_TYPE_VIDEO,
4556     CODEC_ID_SNOW,
4557     sizeof(SnowContext),
4558     encode_init,
4559     encode_frame,
4560     encode_end,
4561 };
4562 #endif
4563
4564
4565 #if 0
4566 #undef malloc
4567 #undef free
4568 #undef printf
4569
4570 int main(){
4571     int width=256;
4572     int height=256;
4573     int buffer[2][width*height];
4574     SnowContext s;
4575     int i;
4576     s.spatial_decomposition_count=6;
4577     s.spatial_decomposition_type=1;
4578
4579     printf("testing 5/3 DWT\n");
4580     for(i=0; i<width*height; i++)
4581         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4582
4583     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4584     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4585
4586     for(i=0; i<width*height; i++)
4587         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4588
4589     printf("testing 9/7 DWT\n");
4590     s.spatial_decomposition_type=0;
4591     for(i=0; i<width*height; i++)
4592         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4593
4594     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4595     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4596
4597     for(i=0; i<width*height; i++)
4598         if(ABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4599
4600 #if 0
4601     printf("testing AC coder\n");
4602     memset(s.header_state, 0, sizeof(s.header_state));
4603     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4604     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4605
4606     for(i=-256; i<256; i++){
4607 START_TIMER
4608         put_symbol(&s.c, s.header_state, i*i*i/3*ABS(i), 1);
4609 STOP_TIMER("put_symbol")
4610     }
4611     ff_rac_terminate(&s.c);
4612
4613     memset(s.header_state, 0, sizeof(s.header_state));
4614     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4615     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4616
4617     for(i=-256; i<256; i++){
4618         int j;
4619 START_TIMER
4620         j= get_symbol(&s.c, s.header_state, 1);
4621 STOP_TIMER("get_symbol")
4622         if(j!=i*i*i/3*ABS(i)) printf("fsck: %d != %d\n", i, j);
4623     }
4624 #endif
4625 {
4626 int level, orientation, x, y;
4627 int64_t errors[8][4];
4628 int64_t g=0;
4629
4630     memset(errors, 0, sizeof(errors));
4631     s.spatial_decomposition_count=3;
4632     s.spatial_decomposition_type=0;
4633     for(level=0; level<s.spatial_decomposition_count; level++){
4634         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4635             int w= width  >> (s.spatial_decomposition_count-level);
4636             int h= height >> (s.spatial_decomposition_count-level);
4637             int stride= width  << (s.spatial_decomposition_count-level);
4638             DWTELEM *buf= buffer[0];
4639             int64_t error=0;
4640
4641             if(orientation&1) buf+=w;
4642             if(orientation>1) buf+=stride>>1;
4643
4644             memset(buffer[0], 0, sizeof(int)*width*height);
4645             buf[w/2 + h/2*stride]= 256*256;
4646             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4647             for(y=0; y<height; y++){
4648                 for(x=0; x<width; x++){
4649                     int64_t d= buffer[0][x + y*width];
4650                     error += d*d;
4651                     if(ABS(width/2-x)<9 && ABS(height/2-y)<9 && level==2) printf("%8lld ", d);
4652                 }
4653                 if(ABS(height/2-y)<9 && level==2) printf("\n");
4654             }
4655             error= (int)(sqrt(error)+0.5);
4656             errors[level][orientation]= error;
4657             if(g) g=ff_gcd(g, error);
4658             else g= error;
4659         }
4660     }
4661     printf("static int const visual_weight[][4]={\n");
4662     for(level=0; level<s.spatial_decomposition_count; level++){
4663         printf("  {");
4664         for(orientation=0; orientation<4; orientation++){
4665             printf("%8lld,", errors[level][orientation]/g);
4666         }
4667         printf("},\n");
4668     }
4669     printf("};\n");
4670     {
4671             int level=2;
4672             int orientation=3;
4673             int w= width  >> (s.spatial_decomposition_count-level);
4674             int h= height >> (s.spatial_decomposition_count-level);
4675             int stride= width  << (s.spatial_decomposition_count-level);
4676             DWTELEM *buf= buffer[0];
4677             int64_t error=0;
4678
4679             buf+=w;
4680             buf+=stride>>1;
4681
4682             memset(buffer[0], 0, sizeof(int)*width*height);
4683 #if 1
4684             for(y=0; y<height; y++){
4685                 for(x=0; x<width; x++){
4686                     int tab[4]={0,2,3,1};
4687                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4688                 }
4689             }
4690             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4691 #else
4692             for(y=0; y<h; y++){
4693                 for(x=0; x<w; x++){
4694                     buf[x + y*stride  ]=169;
4695                     buf[x + y*stride-w]=64;
4696                 }
4697             }
4698             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4699 #endif
4700             for(y=0; y<height; y++){
4701                 for(x=0; x<width; x++){
4702                     int64_t d= buffer[0][x + y*width];
4703                     error += d*d;
4704                     if(ABS(width/2-x)<9 && ABS(height/2-y)<9) printf("%8lld ", d);
4705                 }
4706                 if(ABS(height/2-y)<9) printf("\n");
4707             }
4708     }
4709
4710 }
4711     return 0;
4712 }
4713 #endif
4714