git.sesse.net Git - ffmpeg/blob - libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  */
  18
  19 #include "avcodec.h"
  20 #include "common.h"
  21 #include "dsputil.h"
  22 #include "snow.h"
  23
  24 #include "rangecoder.h"
  25
  26 #include "mpegvideo.h"
  27
  28 #undef NDEBUG
  29 #include <assert.h>
  30
/* 256-entry lookup with outputs in {-1,0,+1}: indices 0, 1 and 255 map to 0,
 * indices 2..127 map to +1 and indices 128..254 map to -1.
 * NOTE(review): the layout suggests the index is a signed value reduced to a
 * byte (two's complement) -- the users of this table are not visible here. */
  31 static const int8_t quant3[256]={
  32  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  33  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  48 };
/* 256-entry lookup with outputs in {-1,0,+1}: index 0 maps to 0, indices
 * 1..127 map to +1 and indices 128..255 map to -1 (no dead zone around 0,
 * unlike quant3). */
  49 static const int8_t quant3b[256]={
  50  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  51  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 };
/* 256-entry lookup with outputs in {-1,0,+1}: indices 0 and 1 map to 0;
 * from index 2 on, even indices map to +1 and odd indices map to -1.
 * NOTE(review): the alternating layout suggests the sign is carried in the
 * lowest index bit -- confirm against the callers, which are not visible here. */
  67 static const int8_t quant3bA[256]={
  68  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  69  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84 };
/* 256-entry lookup with outputs in -2..+2 (five levels). Indices 0..127 give
 * non-negative levels, indices 128..255 give negative levels; magnitudes grow
 * with distance from index 0 (upwards) or index 255 (downwards). */
  85 static const int8_t quant5[256]={
  86  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  87  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 102 };
/* 256-entry lookup with outputs in -3..+3 (seven levels). Indices 0..127 give
 * non-negative levels, indices 128..255 give negative levels. */
 103 static const int8_t quant7[256]={
 104  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 105  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 107  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 118 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 120 };
/* 256-entry lookup with outputs in -4..+4 (nine levels). Indices 0..127 give
 * non-negative levels, indices 128..255 give negative levels. */
 121 static const int8_t quant9[256]={
 122  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 123  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 124  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 137 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 138 };
/* 256-entry lookup with outputs in -5..+5 (eleven levels). Indices 0..127
 * give non-negative levels, indices 128..255 give negative levels. */
 139 static const int8_t quant11[256]={
 140  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 141  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 142  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 143  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 154 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 155 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 156 };
/* 256-entry lookup with outputs in -6..+6 (thirteen levels). Indices 0..127
 * give non-negative levels, indices 128..255 give negative levels. */
 157 static const int8_t quant13[256]={
 158  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 159  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 160  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 162  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 171 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 174 };
 175
 176 #if 0 //64*cubic
/* 32x32 weight window (1024 entries, row-major, 32 values per row), cubic
 * variant with a peak value of 64. Compiled out by the enclosing "#if 0".
 * The window is symmetric both horizontally and vertically. */
 177 static const uint8_t obmc32[1024]={
 178  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 179  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 180  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 181  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 182  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 183  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 184  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 185  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 186  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 187  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 188  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 189  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 190  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 191  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 192  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 193  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 194  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 195  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 196  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 197  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 198  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 199  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 200  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 201  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 202  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 203  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 204  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 205  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 206  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 207  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 208  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 209  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 210 //error:0.000022
 211 };
/* 16x16 weight window (256 entries, row-major, 16 values per row), cubic
 * variant. Compiled out by the enclosing "#if 0". */
 212 static const uint8_t obmc16[256]={
 213  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 214  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 215  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 216  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 217  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 218  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 219  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 220  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 221  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 222  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 223  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 224  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 225  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 226  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 227  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 228  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 229 //error:0.000033
 230 };
 231 #elif 1 // 64*linear
/* 32x32 weight window (1024 entries, row-major, 32 values per row), linear
 * variant. This is the definition actually compiled, because it sits in the
 * "#elif 1" branch. All entries are multiples of 4 and the peak value is 240,
 * so the scale differs from the 0..64 range of the other two variants. */
 232 static const uint8_t obmc32[1024]={
 233   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 234   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 235   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 236   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 237   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 238   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 239   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 240   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 241   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 242   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 243   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 244   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 245   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 246   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 247   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 248   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 251   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 252   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 253   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 254   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 255   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 256   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 257   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 258   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 259   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 260   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 261   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 262   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 263   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 264   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 265  //error:0.000020
 266 };
/* 16x16 weight window (256 entries, row-major, 16 values per row), linear
 * variant. This is the definition actually compiled, because it sits in the
 * "#elif 1" branch. All entries are multiples of 4; the peak value is 224. */
 267 static const uint8_t obmc16[256]={
 268   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 269   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 270   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 271   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 272   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 273  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 274  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 275  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 278  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 279   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 280   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 281   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 282   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 283   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 284 //error:0.000015
 285 };
 286 #else //64*cos
/* 32x32 weight window (1024 entries, row-major, 32 values per row), cosine
 * variant with a peak value of 64. Never compiled: it sits in the "#else"
 * branch after an "#elif 1". */
 287 static const uint8_t obmc32[1024]={
 288  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 289  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 290  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 291  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 292  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 293  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 294  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 295  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 296  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 297  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 298  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 299  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 300  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 301  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 302  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 303  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 304  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 305  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 306  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 307  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 308  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 309  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 310  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 311  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 312  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 313  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 314  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 315  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 316  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 317  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 318  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 319  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 320 //error:0.000022
 321 };
/* 16x16 weight window (256 entries, row-major, 16 values per row), cosine
 * variant. Never compiled: it sits in the "#else" branch after an "#elif 1". */
 322 static const uint8_t obmc16[256]={
 323  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 324  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 325  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 326  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 327  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 328  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 329  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 330  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 331  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 332  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 333  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 334  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 335  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 336  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 337  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 338  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 339 //error:0.000022
 340 };
 341 #endif
 342
 343 //linear *64
/* 8x8 weight window (64 entries, row-major). Each entry is the product of a
 * row factor and a column factor from {1,3,5,7,7,5,3,1}, times 4. */
 344 static const uint8_t obmc8[64]={
 345   4, 12, 20, 28, 28, 20, 12,  4,
 346  12, 36, 60, 84, 84, 60, 36, 12,
 347  20, 60,100,140,140,100, 60, 20,
 348  28, 84,140,196,196,140, 84, 28,
 349  28, 84,140,196,196,140, 84, 28,
 350  20, 60,100,140,140,100, 60, 20,
 351  12, 36, 60, 84, 84, 60, 36, 12,
 352   4, 12, 20, 28, 28, 20, 12,  4,
 353 //error:0.000000
 354 };
 355
 356 //linear *64
/* 4x4 weight window (16 entries, row-major). Each entry is the product of a
 * row factor and a column factor from {1,3,3,1}, times 16. */
 357 static const uint8_t obmc4[16]={
 358  16, 48, 48, 16,
 359  48,144,144, 48,
 360  48,144,144, 48,
 361  16, 48, 48, 16,
 362 //error:0.000000
 363 };
 364
 365 static const uint8_t *obmc_tab[4]={
 366     obmc32, obmc16, obmc8, obmc4
 367 };
 368
/* Per-block record holding a vector (mx,my), three colour bytes, a set of
 * type flags and a level.
 * NOTE(review): mx/my look like motion-vector components and color[] like one
 * value per plane -- confirm against the code that fills these in. */
 369 typedef struct BlockNode{
 370     int16_t mx;
 371     int16_t my;
 372     uint8_t color[3];
 373     uint8_t type; // bit mask built from the BLOCK_* values defined below
 374 //#define TYPE_SPLIT    1
 375 #define BLOCK_INTRA   1
 376 #define BLOCK_OPT     2
 377 //#define TYPE_NOCOLOR  4
 378     uint8_t level; //FIXME merge into type?
 379 }BlockNode;
 380
 381 static const BlockNode null_block= { //FIXME add border maybe
 382     .color= {128,128,128},
 383     .mx= 0,
 384     .my= 0,
 385     .type= 0,
 386     .level= 0,
 387 };
 388
/* MB_SIZE is defined as a power of two via its log2; with LOG2_MB_SIZE == 4
 * it evaluates to 16. */
 389 #define LOG2_MB_SIZE 4
 390 #define MB_SIZE (1<<LOG2_MB_SIZE)
 391
/* Pair of a signed 16-bit x value and an unsigned 16-bit coefficient.
 * NOTE(review): presumably one stored (position, magnitude) entry of a
 * subband line -- the producer and consumer are not visible here. */
 392 typedef struct x_and_coeff{
 393     int16_t x;
 394     uint16_t coeff;
 395 } x_and_coeff;
 396
/* Description of one subband: its geometry, quantizer log, coefficient
 * storage, a link to a parent subband and its own coder state tables. */
 397 typedef struct SubBand{
 398     int level;
 399     int stride;
 400     int width;
 401     int height;
 402     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 403     DWTELEM *buf;
 404     int buf_x_offset;
 405     int buf_y_offset;
 406     int stride_line; ///< Stride measured in lines, not pixels.
 407     x_and_coeff * x_coeff;
 408     struct SubBand *parent;
 409     uint8_t state[/*7*2*/ 7 + 512][32]; // 519 rows of 32 state bytes each
 410 }SubBand;
 411
/* One plane: its dimensions plus four SubBand entries for each of up to
 * MAX_DECOMPOSITIONS decomposition levels. */
 412 typedef struct Plane{
 413     int width;
 414     int height;
 415     SubBand band[MAX_DECOMPOSITIONS][4];
 416 }Plane;
 417
/* Codec-wide state: range coder, picture buffers, coder state tables, header
 * parameters, per-plane subband descriptions, the block array, a
 * motion-estimation cache and the slice buffer. */
 418 typedef struct SnowContext{
 419 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 420
 421     AVCodecContext *avctx;
 422     RangeCoder c;
 423     DSPContext dsp;
 424     AVFrame new_picture;
 425     AVFrame input_picture;              ///< new_picture with the internal linesizes
 426     AVFrame current_picture;
 427     AVFrame last_picture;
 428     AVFrame mconly_picture;
 429 //     uint8_t q_context[16];
 430     uint8_t header_state[32];
 431     uint8_t block_state[128 + 32*128];
 432     int keyframe;
 433     int always_reset;
 434     int version;
 435     int spatial_decomposition_type;
 436     int temporal_decomposition_type;
 437     int spatial_decomposition_count;
 438     int temporal_decomposition_count;
 439     DWTELEM *spatial_dwt_buffer;
 440     int colorspace_type;
 441     int chroma_h_shift;
 442     int chroma_v_shift;
 443     int spatial_scalability;
 444     int qlog;
 445     int lambda;
 446     int lambda2;
 447     int mv_scale;
 448     int qbias;
 449 #define QBIAS_SHIFT 3
 450     int b_width;
 451     int b_height;
 452     int block_max_depth;
 453     Plane plane[MAX_PLANES];
 454     BlockNode *block;
 455 #define ME_CACHE_SIZE 1024
 456     int me_cache[ME_CACHE_SIZE];
 457     int me_cache_generation;
 458     slice_buffer sb;
 459
 460     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 461 }SnowContext;
 462
/* Four DWTELEM line pointers plus a row counter.
 * NOTE(review): presumably the rolling window of lines kept by the inverse
 * wavelet transform -- the code using it is not visible here. */
 463 typedef struct {
 464     DWTELEM *b0;
 465     DWTELEM *b1;
 466     DWTELEM *b2;
 467     DWTELEM *b3;
 468     int y;
 469 } dwt_compose_t;
 470
/* Fast path for fetching a line: returns line[line_num] when it is already
 * set and only calls slice_buffer_load_line() otherwise. Both macro arguments
 * are expanded more than once, so they must be free of side effects. */
 471 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 472 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 473
/* Forward declaration; the definition is further down the file. */
 474 static void iterative_me(SnowContext *s);
 475
 476 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, DWTELEM * base_buffer)
 477 {
 478     int i;
 479
 480     buf->base_buffer = base_buffer;
 481     buf->line_count = line_count;
 482     buf->line_width = line_width;
 483     buf->data_count = max_allocated_lines;
 484     buf->line = (DWTELEM * *) av_mallocz (sizeof(DWTELEM *) * line_count);
 485     buf->data_stack = (DWTELEM * *) av_malloc (sizeof(DWTELEM *) * max_allocated_lines);
 486
 487     for (i = 0; i < max_allocated_lines; i++)
 488     {
 489       buf->data_stack[i] = (DWTELEM *) av_malloc (sizeof(DWTELEM) * line_width);
 490     }
 491
 492     buf->data_stack_top = max_allocated_lines - 1;
 493 }
 494
 495 static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 496 {
 497     int offset;
 498     DWTELEM * buffer;
 499
 500 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 501
 502     assert(buf->data_stack_top >= 0);
 503 //  assert(!buf->line[line]);
 504     if (buf->line[line])
 505         return buf->line[line];
 506
 507     offset = buf->line_width * line;
 508     buffer = buf->data_stack[buf->data_stack_top];
 509     buf->data_stack_top--;
 510     buf->line[line] = buffer;
 511
 512 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 513
 514     return buffer;
 515 }
 516
 517 static void slice_buffer_release(slice_buffer * buf, int line)
 518 {
 519     int offset;
 520     DWTELEM * buffer;
 521
 522     assert(line >= 0 && line < buf->line_count);
 523     assert(buf->line[line]);
 524
 525     offset = buf->line_width * line;
 526     buffer = buf->line[line];
 527     buf->data_stack_top++;
 528     buf->data_stack[buf->data_stack_top] = buffer;
 529     buf->line[line] = NULL;
 530
 531 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 532 }
 533
 534 static void slice_buffer_flush(slice_buffer * buf)
 535 {
 536     int i;
 537     for (i = 0; i < buf->line_count; i++)
 538     {
 539         if (buf->line[i])
 540         {
 541 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 542             slice_buffer_release(buf, i);
 543         }
 544     }
 545 }
 546
 547 static void slice_buffer_destroy(slice_buffer * buf)
 548 {
 549     int i;
 550     slice_buffer_flush(buf);
 551
 552     for (i = buf->data_count - 1; i >= 0; i--)
 553     {
 554         assert(buf->data_stack[i]);
 555         av_freep(&buf->data_stack[i]);
 556     }
 557     assert(buf->data_stack);
 558     av_freep(&buf->data_stack);
 559     assert(buf->line);
 560     av_freep(&buf->line);
 561 }
 562
 563 #ifdef __sgi
 564 // Avoid a name clash on SGI IRIX
 565 #undef qexp
 566 #endif
/* QEXPSHIFT depends on FRAC_BITS, which is defined outside this part of the file. */
 567 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
/* Table of QROOT bytes; not const, so it is filled in at run time elsewhere. */
 568 static uint8_t qexp[QROOT];
 569
 570 static inline int mirror(int v, int m){
 571     while((unsigned)v > (unsigned)m){
 572         v=-v;
 573         if(v<0) v+= 2*m;
 574     }
 575     return v;
 576 }
 577
 578 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 579     int i;
 580
 581     if(v){
 582         const int a= ABS(v);
 583         const int e= av_log2(a);
 584 #if 1
 585         const int el= FFMIN(e, 10);
 586         put_rac(c, state+0, 0);
 587
 588         for(i=0; i<el; i++){
 589             put_rac(c, state+1+i, 1);  //1..10
 590         }
 591         for(; i<e; i++){
 592             put_rac(c, state+1+9, 1);  //1..10
 593         }
 594         put_rac(c, state+1+FFMIN(i,9), 0);
 595
 596         for(i=e-1; i>=el; i--){
 597             put_rac(c, state+22+9, (a>>i)&1); //22..31
 598         }
 599         for(; i>=0; i--){
 600             put_rac(c, state+22+i, (a>>i)&1); //22..31
 601         }
 602
 603         if(is_signed)
 604             put_rac(c, state+11 + el, v < 0); //11..21
 605 #else
 606
 607         put_rac(c, state+0, 0);
 608         if(e<=9){
 609             for(i=0; i<e; i++){
 610                 put_rac(c, state+1+i, 1);  //1..10
 611             }
 612             put_rac(c, state+1+i, 0);
 613
 614             for(i=e-1; i>=0; i--){
 615                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 616             }
 617
 618             if(is_signed)
 619                 put_rac(c, state+11 + e, v < 0); //11..21
 620         }else{
 621             for(i=0; i<e; i++){
 622                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 623             }
 624             put_rac(c, state+1+FFMIN(i,9), 0);
 625
 626             for(i=e-1; i>=0; i--){
 627                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 628             }
 629
 630             if(is_signed)
 631                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 632         }
 633 #endif
 634     }else{
 635         put_rac(c, state+0, 1);
 636     }
 637 }
 638
 639 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 640     if(get_rac(c, state+0))
 641         return 0;
 642     else{
 643         int i, e, a;
 644         e= 0;
 645         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 646             e++;
 647         }
 648
 649         a= 1;
 650         for(i=e-1; i>=0; i--){
 651             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 652         }
 653
 654         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 655             return -a;
 656         else
 657             return a;
 658     }
 659 }
 660
 661 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 662     int i;
 663     int r= log2>=0 ? 1<<log2 : 1;
 664
 665     assert(v>=0);
 666     assert(log2>=-4);
 667
 668     while(v >= r){
 669         put_rac(c, state+4+log2, 1);
 670         v -= r;
 671         log2++;
 672         if(log2>0) r+=r;
 673     }
 674     put_rac(c, state+4+log2, 0);
 675
 676     for(i=log2-1; i>=0; i--){
 677         put_rac(c, state+31-i, (v>>i)&1);
 678     }
 679 }
 680
 681 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 682     int i;
 683     int r= log2>=0 ? 1<<log2 : 1;
 684     int v=0;
 685
 686     assert(log2>=-4);
 687
 688     while(get_rac(c, state+4+log2)){
 689         v+= r;
 690         log2++;
 691         if(log2>0) r+=r;
 692     }
 693
 694     for(i=log2-1; i>=0; i--){
 695         v+= get_rac(c, state+31-i)<<i;
 696     }
 697
 698     return v;
 699 }
 700
 701 static always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 702     const int mirror_left= !highpass;
 703     const int mirror_right= (width&1) ^ highpass;
 704     const int w= (width>>1) - 1 + (highpass & width);
 705     int i;
 706
 707 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 708     if(mirror_left){
 709         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 710         dst += dst_step;
 711         src += src_step;
 712     }
 713
 714     for(i=0; i<w; i++){
 715         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 716     }
 717
 718     if(mirror_right){
 719         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 720     }
 721 }
 722
 723 #ifndef lift5
 724 static always_inline void lift5(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 725     const int mirror_left= !highpass;
 726     const int mirror_right= (width&1) ^ highpass;
 727     const int w= (width>>1) - 1 + (highpass & width);
 728     int i;
 729
 730     if(mirror_left){
 731         int r= 3*2*ref[0];
 732         r += r>>4;
 733         r += r>>8;
 734         dst[0] = LIFT(src[0], ((r+add)>>shift), inverse);
 735         dst += dst_step;
 736         src += src_step;
 737     }
 738
 739     for(i=0; i<w; i++){
 740         int r= 3*(ref[i*ref_step] + ref[(i+1)*ref_step]);
 741         r += r>>4;
 742         r += r>>8;
 743         dst[i*dst_step] = LIFT(src[i*src_step], ((r+add)>>shift), inverse);
 744     }
 745
 746     if(mirror_right){
 747         int r= 3*2*ref[w*ref_step];
 748         r += r>>4;
 749         r += r>>8;
 750         dst[w*dst_step] = LIFT(src[w*src_step], ((r+add)>>shift), inverse);
 751     }
 752 }
 753 #endif
 754
 755 #ifndef liftS
 756 static always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 757     const int mirror_left= !highpass;
 758     const int mirror_right= (width&1) ^ highpass;
 759     const int w= (width>>1) - 1 + (highpass & width);
 760     int i;
 761
 762     assert(shift == 4);
 763 #define LIFTS(src, ref, inv) ((inv) ? (src) - (((ref) - 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
 764     if(mirror_left){
 765         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 766         dst += dst_step;
 767         src += src_step;
 768     }
 769
 770     for(i=0; i<w; i++){
 771         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 772     }
 773
 774     if(mirror_right){
 775         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 776     }
 777 }
 778 #endif
 779
 780
 781 static void inplace_lift(DWTELEM *dst, int width, int *coeffs, int n, int shift, int start, int inverse){
 782     int x, i;
 783
 784     for(x=start; x<width; x+=2){
 785         int64_t sum=0;
 786
 787         for(i=0; i<n; i++){
 788             int x2= x + 2*i - n + 1;
 789             if     (x2<     0) x2= -x2;
 790             else if(x2>=width) x2= 2*width-x2-2;
 791             sum += coeffs[i]*(int64_t)dst[x2];
 792         }
 793         if(inverse) dst[x] -= (sum + (1<<shift)/2)>>shift;
 794         else        dst[x] += (sum + (1<<shift)/2)>>shift;
 795     }
 796 }
 797
 798 static void inplace_liftV(DWTELEM *dst, int width, int height, int stride, int *coeffs, int n, int shift, int start, int inverse){
 799     int x, y, i;
 800     for(y=start; y<height; y+=2){
 801         for(x=0; x<width; x++){
 802             int64_t sum=0;
 803
 804             for(i=0; i<n; i++){
 805                 int y2= y + 2*i - n + 1;
 806                 if     (y2<      0) y2= -y2;
 807                 else if(y2>=height) y2= 2*height-y2-2;
 808                 sum += coeffs[i]*(int64_t)dst[x + y2*stride];
 809             }
 810             if(inverse) dst[x + y*stride] -= (sum + (1<<shift)/2)>>shift;
 811             else        dst[x + y*stride] += (sum + (1<<shift)/2)>>shift;
 812         }
 813     }
 814 }
 815
/*
 * Configuration of the generic "X" transform (horizontal_decomposeX(),
 * spatial_decomposeX() and their compose counterparts).
 *
 * Nk / SHIFTk / COEFFSk are the tap count, right shift and tap weights passed
 * to the k-th inplace_lift()/inplace_liftV() pass; LX0/LX1 are the start
 * indices passed to those passes; SCALEX is the factor every sample is
 * multiplied by before decomposition and divided by after composition.
 */
#define SCALEX 1
#define LX0 0
#define LX1 1

/*
 * Only the first branch whose condition is non-zero is compiled. With the
 * conditions as written that is the "13/7 CRF" set; the later "#elif 1"
 * branches are alternatives that are never reached.
 */
#if 0 // more accurate 9/7
#define N1 2
#define SHIFT1 14
#define COEFFS1 (int[]){-25987,-25987}
#define N2 2
#define SHIFT2 19
#define COEFFS2 (int[]){-27777,-27777}
#define N3 2
#define SHIFT3 15
#define COEFFS3 (int[]){28931,28931}
#define N4 2
#define SHIFT4 15
#define COEFFS4 (int[]){14533,14533}
#elif 1 // 13/7 CRF
/* active configuration: two 4-tap passes, passes 3 and 4 are empty (N=0) */
#define N1 4
#define SHIFT1 4
#define COEFFS1 (int[]){1,-9,-9,1}
#define N2 4
#define SHIFT2 4
#define COEFFS2 (int[]){-1,5,5,-1}
#define N3 0
#define SHIFT3 1
#define COEFFS3 NULL
#define N4 0
#define SHIFT4 1
#define COEFFS4 NULL
#elif 1 // 3/5
/* NOTE(review): redefines LX0/LX1/SCALEX with different values and no #undef;
 * enabling this branch would presumably need that addressed first. */
#define LX0 1
#define LX1 0
#define SCALEX 0.5
#define N1 2
#define SHIFT1 1
#define COEFFS1 (int[]){1,1}
#define N2 2
#define SHIFT2 2
#define COEFFS2 (int[]){-1,-1}
#define N3 0
#define SHIFT3 0
#define COEFFS3 NULL
#define N4 0
#define SHIFT4 0
#define COEFFS4 NULL
#elif 1 // 11/5
#define N1 0
#define SHIFT1 1
#define COEFFS1 NULL
#define N2 2
#define SHIFT2 2
#define COEFFS2 (int[]){-1,-1}
#define N3 2
#define SHIFT3 0
#define COEFFS3 (int[]){-1,-1}
#define N4 4
#define SHIFT4 7
#define COEFFS4 (int[]){-5,29,29,-5}
/* NOTE(review): redefines SCALEX (1 above) without #undef */
#define SCALEX 4
#elif 1 // 9/7 CDF
#define N1 2
#define SHIFT1 7
#define COEFFS1 (int[]){-203,-203}
#define N2 2
#define SHIFT2 12
#define COEFFS2 (int[]){-217,-217}
#define N3 2
#define SHIFT3 7
#define COEFFS3 (int[]){113,113}
#define N4 2
#define SHIFT4 9
#define COEFFS4 (int[]){227,227}
#define SCALEX 1
#elif 1 // 7/5 CDF
#define N1 0
#define SHIFT1 1
#define COEFFS1 NULL
#define N2 2
#define SHIFT2 2
#define COEFFS2 (int[]){-1,-1}
#define N3 2
#define SHIFT3 0
#define COEFFS3 (int[]){-1,-1}
#define N4 2
#define SHIFT4 4
#define COEFFS4 (int[]){3,3}
#elif 1 // 9/7 MN
#define N1 4
#define SHIFT1 4
#define COEFFS1 (int[]){1,-9,-9,1}
#define N2 2
#define SHIFT2 2
#define COEFFS2 (int[]){1,1}
#define N3 0
#define SHIFT3 1
#define COEFFS3 NULL
#define N4 0
#define SHIFT4 1
#define COEFFS4 NULL
#else // 13/7 CRF
/* fallback, identical to the active 13/7 CRF set above */
#define N1 4
#define SHIFT1 4
#define COEFFS1 (int[]){1,-9,-9,1}
#define N2 4
#define SHIFT2 4
#define COEFFS2 (int[]){-1,5,5,-1}
#define N3 0
#define SHIFT3 1
#define COEFFS3 NULL
#define N4 0
#define SHIFT4 1
#define COEFFS4 NULL
#endif
 930 static void horizontal_decomposeX(DWTELEM *b, int width){
 931     DWTELEM temp[width];
 932     const int width2= width>>1;
 933     const int w2= (width+1)>>1;
 934     int x;
 935
 936     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 0);
 937     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 0);
 938     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 0);
 939     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 0);
 940
 941     for(x=0; x<width2; x++){
 942         temp[x   ]= b[2*x    ];
 943         temp[x+w2]= b[2*x + 1];
 944     }
 945     if(width&1)
 946         temp[x   ]= b[2*x    ];
 947     memcpy(b, temp, width*sizeof(int));
 948 }
 949
 950 static void horizontal_composeX(DWTELEM *b, int width){
 951     DWTELEM temp[width];
 952     const int width2= width>>1;
 953     int x;
 954     const int w2= (width+1)>>1;
 955
 956     memcpy(temp, b, width*sizeof(int));
 957     for(x=0; x<width2; x++){
 958         b[2*x    ]= temp[x   ];
 959         b[2*x + 1]= temp[x+w2];
 960     }
 961     if(width&1)
 962         b[2*x    ]= temp[x   ];
 963
 964     inplace_lift(b, width, COEFFS4, N4, SHIFT4, LX0, 1);
 965     inplace_lift(b, width, COEFFS3, N3, SHIFT3, LX1, 1);
 966     inplace_lift(b, width, COEFFS2, N2, SHIFT2, LX0, 1);
 967     inplace_lift(b, width, COEFFS1, N1, SHIFT1, LX1, 1);
 968 }
 969
 970 static void spatial_decomposeX(DWTELEM *buffer, int width, int height, int stride){
 971     int x, y;
 972
 973     for(y=0; y<height; y++){
 974         for(x=0; x<width; x++){
 975             buffer[y*stride + x] *= SCALEX;
 976         }
 977     }
 978
 979     for(y=0; y<height; y++){
 980         horizontal_decomposeX(buffer + y*stride, width);
 981     }
 982
 983     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 0);
 984     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 0);
 985     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 0);
 986     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 0);
 987 }
 988
 989 static void spatial_composeX(DWTELEM *buffer, int width, int height, int stride){
 990     int x, y;
 991
 992     inplace_liftV(buffer, width, height, stride, COEFFS4, N4, SHIFT4, LX0, 1);
 993     inplace_liftV(buffer, width, height, stride, COEFFS3, N3, SHIFT3, LX1, 1);
 994     inplace_liftV(buffer, width, height, stride, COEFFS2, N2, SHIFT2, LX0, 1);
 995     inplace_liftV(buffer, width, height, stride, COEFFS1, N1, SHIFT1, LX1, 1);
 996
 997     for(y=0; y<height; y++){
 998         horizontal_composeX(buffer + y*stride, width);
 999     }
1000
1001     for(y=0; y<height; y++){
1002         for(x=0; x<width; x++){
1003             buffer[y*stride + x] /= SCALEX;
1004         }
1005     }
1006 }
1007
1008 static void horizontal_decompose53i(DWTELEM *b, int width){
1009     DWTELEM temp[width];
1010     const int width2= width>>1;
1011     int x;
1012     const int w2= (width+1)>>1;
1013
1014     for(x=0; x<width2; x++){
1015         temp[x   ]= b[2*x    ];
1016         temp[x+w2]= b[2*x + 1];
1017     }
1018     if(width&1)
1019         temp[x   ]= b[2*x    ];
1020 #if 0
1021     {
1022     int A1,A2,A3,A4;
1023     A2= temp[1       ];
1024     A4= temp[0       ];
1025     A1= temp[0+width2];
1026     A1 -= (A2 + A4)>>1;
1027     A4 += (A1 + 1)>>1;
1028     b[0+width2] = A1;
1029     b[0       ] = A4;
1030     for(x=1; x+1<width2; x+=2){
1031         A3= temp[x+width2];
1032         A4= temp[x+1     ];
1033         A3 -= (A2 + A4)>>1;
1034         A2 += (A1 + A3 + 2)>>2;
1035         b[x+width2] = A3;
1036         b[x       ] = A2;
1037
1038         A1= temp[x+1+width2];
1039         A2= temp[x+2       ];
1040         A1 -= (A2 + A4)>>1;
1041         A4 += (A1 + A3 + 2)>>2;
1042         b[x+1+width2] = A1;
1043         b[x+1       ] = A4;
1044     }
1045     A3= temp[width-1];
1046     A3 -= A2;
1047     A2 += (A1 + A3 + 2)>>2;
1048     b[width -1] = A3;
1049     b[width2-1] = A2;
1050     }
1051 #else
1052     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
1053     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
1054 #endif
1055 }
1056
1057 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1058     int i;
1059
1060     for(i=0; i<width; i++){
1061         b1[i] -= (b0[i] + b2[i])>>1;
1062     }
1063 }
1064
1065 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1066     int i;
1067
1068     for(i=0; i<width; i++){
1069         b1[i] += (b0[i] + b2[i] + 2)>>2;
1070     }
1071 }
1072
1073 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
1074     int y;
1075     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
1076     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
1077
1078     for(y=-2; y<height; y+=2){
1079         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1080         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1081
1082 {START_TIMER
1083         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
1084         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
1085 STOP_TIMER("horizontal_decompose53i")}
1086
1087 {START_TIMER
1088         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
1089         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
1090 STOP_TIMER("vertical_decompose53i*")}
1091
1092         b0=b2;
1093         b1=b3;
1094     }
1095 }
1096
1097 static void horizontal_decompose97i(DWTELEM *b, int width){
1098     DWTELEM temp[width];
1099     const int w2= (width+1)>>1;
1100
1101     lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
1102     liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
1103     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
1104     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
1105 }
1106
1107
1108 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1109     int i;
1110
1111     for(i=0; i<width; i++){
1112         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1113     }
1114 }
1115
1116 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1117     int i;
1118
1119     for(i=0; i<width; i++){
1120 #ifdef lift5
1121         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1122 #else
1123         int r= 3*(b0[i] + b2[i]);
1124         r+= r>>4;
1125         r+= r>>8;
1126         b1[i] += (r+W_CO)>>W_CS;
1127 #endif
1128     }
1129 }
1130
1131 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1132     int i;
1133
1134     for(i=0; i<width; i++){
1135 #ifdef liftS
1136         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1137 #else
1138         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
1139 #endif
1140     }
1141 }
1142
1143 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1144     int i;
1145
1146     for(i=0; i<width; i++){
1147         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1148     }
1149 }
1150
1151 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
1152     int y;
1153     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1154     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1155     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1156     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1157
1158     for(y=-4; y<height; y+=2){
1159         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1160         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1161
1162 {START_TIMER
1163         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1164         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1165 if(width>400){
1166 STOP_TIMER("horizontal_decompose97i")
1167 }}
1168
1169 {START_TIMER
1170         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1171         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1172         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1173         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1174
1175 if(width>400){
1176 STOP_TIMER("vertical_decompose97i")
1177 }}
1178
1179         b0=b2;
1180         b1=b3;
1181         b2=b4;
1182         b3=b5;
1183     }
1184 }
1185
1186 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1187     int level;
1188
1189     for(level=0; level<decomposition_count; level++){
1190         switch(type){
1191         case 0: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1192         case 1: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1193         case 2: spatial_decomposeX  (buffer, width>>level, height>>level, stride<<level); break;
1194         }
1195     }
1196 }
1197
1198 static void horizontal_compose53i(DWTELEM *b, int width){
1199     DWTELEM temp[width];
1200     const int width2= width>>1;
1201     const int w2= (width+1)>>1;
1202     int x;
1203
1204 #if 0
1205     int A1,A2,A3,A4;
1206     A2= temp[1       ];
1207     A4= temp[0       ];
1208     A1= temp[0+width2];
1209     A1 -= (A2 + A4)>>1;
1210     A4 += (A1 + 1)>>1;
1211     b[0+width2] = A1;
1212     b[0       ] = A4;
1213     for(x=1; x+1<width2; x+=2){
1214         A3= temp[x+width2];
1215         A4= temp[x+1     ];
1216         A3 -= (A2 + A4)>>1;
1217         A2 += (A1 + A3 + 2)>>2;
1218         b[x+width2] = A3;
1219         b[x       ] = A2;
1220
1221         A1= temp[x+1+width2];
1222         A2= temp[x+2       ];
1223         A1 -= (A2 + A4)>>1;
1224         A4 += (A1 + A3 + 2)>>2;
1225         b[x+1+width2] = A1;
1226         b[x+1       ] = A4;
1227     }
1228     A3= temp[width-1];
1229     A3 -= A2;
1230     A2 += (A1 + A3 + 2)>>2;
1231     b[width -1] = A3;
1232     b[width2-1] = A2;
1233 #else
1234     lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1235     lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1236 #endif
1237     for(x=0; x<width2; x++){
1238         b[2*x    ]= temp[x   ];
1239         b[2*x + 1]= temp[x+w2];
1240     }
1241     if(width&1)
1242         b[2*x    ]= temp[x   ];
1243 }
1244
1245 static void vertical_compose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1246     int i;
1247
1248     for(i=0; i<width; i++){
1249         b1[i] += (b0[i] + b2[i])>>1;
1250     }
1251 }
1252
1253 static void vertical_compose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1254     int i;
1255
1256     for(i=0; i<width; i++){
1257         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1258     }
1259 }
1260
1261 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1262     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1263     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1264     cs->y = -1;
1265 }
1266
1267 static void spatial_compose53i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1268     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1269     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1270     cs->y = -1;
1271 }
1272
1273 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1274     int y= cs->y;
1275
1276     DWTELEM *b0= cs->b0;
1277     DWTELEM *b1= cs->b1;
1278     DWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1279     DWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1280
1281 {START_TIMER
1282         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1283         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1284 STOP_TIMER("vertical_compose53i*")}
1285
1286 {START_TIMER
1287         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1288         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1289 STOP_TIMER("horizontal_compose53i")}
1290
1291     cs->b0 = b2;
1292     cs->b1 = b3;
1293     cs->y += 2;
1294 }
1295
1296 static void spatial_compose53i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1297     int y= cs->y;
1298     DWTELEM *b0= cs->b0;
1299     DWTELEM *b1= cs->b1;
1300     DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1301     DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1302
1303 {START_TIMER
1304         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1305         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1306 STOP_TIMER("vertical_compose53i*")}
1307
1308 {START_TIMER
1309         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1310         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1311 STOP_TIMER("horizontal_compose53i")}
1312
1313     cs->b0 = b2;
1314     cs->b1 = b3;
1315     cs->y += 2;
1316 }
1317
1318 static void spatial_compose53i(DWTELEM *buffer, int width, int height, int stride){
1319     dwt_compose_t cs;
1320     spatial_compose53i_init(&cs, buffer, height, stride);
1321     while(cs.y <= height)
1322         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1323 }
1324
1325
1326 void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
1327     DWTELEM temp[width];
1328     const int w2= (width+1)>>1;
1329
1330     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1331     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1332     liftS(b      , temp   , temp+w2, 2, 1, 1, width, -W_BM, W_BO, W_BS, 0, 1);
1333     lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
1334 }
1335
1336 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1337     int i;
1338
1339     for(i=0; i<width; i++){
1340         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1341     }
1342 }
1343
1344 static void vertical_compose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1345     int i;
1346
1347     for(i=0; i<width; i++){
1348 #ifdef lift5
1349         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1350 #else
1351         int r= 3*(b0[i] + b2[i]);
1352         r+= r>>4;
1353         r+= r>>8;
1354         b1[i] -= (r+W_CO)>>W_CS;
1355 #endif
1356     }
1357 }
1358
1359 static void vertical_compose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1360     int i;
1361
1362     for(i=0; i<width; i++){
1363 #ifdef liftS
1364         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1365 #else
1366         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1367 #endif
1368     }
1369 }
1370
1371 static void vertical_compose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
1372     int i;
1373
1374     for(i=0; i<width; i++){
1375         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1376     }
1377 }
1378
1379 void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
1380     int i;
1381
1382     for(i=0; i<width; i++){
1383 #ifndef lift5
1384         int r;
1385 #endif
1386         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1387 #ifdef lift5
1388         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1389 #else
1390         r= 3*(b2[i] + b4[i]);
1391         r+= r>>4;
1392         r+= r>>8;
1393         b3[i] -= (r+W_CO)>>W_CS;
1394 #endif
1395 #ifdef liftS
1396         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1397 #else
1398         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1399 #endif
1400         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1401     }
1402 }
1403
1404 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1405     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1406     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1407     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1408     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1409     cs->y = -3;
1410 }
1411
1412 static void spatial_compose97i_init(dwt_compose_t *cs, DWTELEM *buffer, int height, int stride){
1413     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1414     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1415     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1416     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1417     cs->y = -3;
1418 }
1419
1420 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1421     int y = cs->y;
1422
1423     DWTELEM *b0= cs->b0;
1424     DWTELEM *b1= cs->b1;
1425     DWTELEM *b2= cs->b2;
1426     DWTELEM *b3= cs->b3;
1427     DWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1428     DWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1429
1430 {START_TIMER
1431     if(y>0 && y+4<height){
1432         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1433     }else{
1434         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1435         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1436         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1437         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1438     }
1439 if(width>400){
1440 STOP_TIMER("vertical_compose97i")}}
1441
1442 {START_TIMER
1443         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1444         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1445 if(width>400 && y+0<(unsigned)height){
1446 STOP_TIMER("horizontal_compose97i")}}
1447
1448     cs->b0=b2;
1449     cs->b1=b3;
1450     cs->b2=b4;
1451     cs->b3=b5;
1452     cs->y += 2;
1453 }
1454
1455 static void spatial_compose97i_dy(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride){
1456     int y = cs->y;
1457     DWTELEM *b0= cs->b0;
1458     DWTELEM *b1= cs->b1;
1459     DWTELEM *b2= cs->b2;
1460     DWTELEM *b3= cs->b3;
1461     DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1462     DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1463
1464 {START_TIMER
1465         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1466         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1467         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1468         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1469 if(width>400){
1470 STOP_TIMER("vertical_compose97i")}}
1471
1472 {START_TIMER
1473         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1474         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1475 if(width>400 && b0 <= b2){
1476 STOP_TIMER("horizontal_compose97i")}}
1477
1478     cs->b0=b2;
1479     cs->b1=b3;
1480     cs->b2=b4;
1481     cs->b3=b5;
1482     cs->y += 2;
1483 }
1484
1485 static void spatial_compose97i(DWTELEM *buffer, int width, int height, int stride){
1486     dwt_compose_t cs;
1487     spatial_compose97i_init(&cs, buffer, height, stride);
1488     while(cs.y <= height)
1489         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1490 }
1491
1492 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1493     int level;
1494     for(level=decomposition_count-1; level>=0; level--){
1495         switch(type){
1496         case 0: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1497         case 1: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1498         /* not slicified yet */
1499         case 2: /*spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;*/
1500           av_log(NULL, AV_LOG_ERROR, "spatial_composeX neither buffered nor slicified yet.\n"); break;
1501         }
1502     }
1503 }
1504
1505 static void ff_spatial_idwt_init(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1506     int level;
1507     for(level=decomposition_count-1; level>=0; level--){
1508         switch(type){
1509         case 0: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1510         case 1: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1511         /* not slicified yet */
1512         case 2: spatial_composeX(buffer, width>>level, height>>level, stride<<level); break;
1513         }
1514     }
1515 }
1516
1517 static void ff_spatial_idwt_slice(dwt_compose_t *cs, DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1518     const int support = type==1 ? 3 : 5;
1519     int level;
1520     if(type==2) return;
1521
1522     for(level=decomposition_count-1; level>=0; level--){
1523         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1524             switch(type){
1525             case 0: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1526                     break;
1527             case 1: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1528                     break;
1529             case 2: break;
1530             }
1531         }
1532     }
1533 }
1534
1535 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1536     const int support = type==1 ? 3 : 5;
1537     int level;
1538     if(type==2) return;
1539
1540     for(level=decomposition_count-1; level>=0; level--){
1541         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1542             switch(type){
1543             case 0: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1544                     break;
1545             case 1: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1546                     break;
1547             case 2: break;
1548             }
1549         }
1550     }
1551 }
1552
1553 static void ff_spatial_idwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1554     if(type==2){
1555         int level;
1556         for(level=decomposition_count-1; level>=0; level--)
1557             spatial_composeX  (buffer, width>>level, height>>level, stride<<level);
1558     }else{
1559         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1560         int y;
1561         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1562         for(y=0; y<height; y+=4)
1563             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1564     }
1565 }
1566
1567 static int encode_subband_c0run(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1568     const int w= b->width;
1569     const int h= b->height;
1570     int x, y;
1571
1572     if(1){
1573         int run=0;
1574         int runs[w*h];
1575         int run_index=0;
1576         int max_index;
1577
1578         for(y=0; y<h; y++){
1579             for(x=0; x<w; x++){
1580                 int v, p=0;
1581                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1582                 v= src[x + y*stride];
1583
1584                 if(y){
1585                     t= src[x + (y-1)*stride];
1586                     if(x){
1587                         lt= src[x - 1 + (y-1)*stride];
1588                     }
1589                     if(x + 1 < w){
1590                         rt= src[x + 1 + (y-1)*stride];
1591                     }
1592                 }
1593                 if(x){
1594                     l= src[x - 1 + y*stride];
1595                     /*if(x > 1){
1596                         if(orientation==1) ll= src[y + (x-2)*stride];
1597                         else               ll= src[x - 2 + y*stride];
1598                     }*/
1599                 }
1600                 if(parent){
1601                     int px= x>>1;
1602                     int py= y>>1;
1603                     if(px<b->parent->width && py<b->parent->height)
1604                         p= parent[px + py*2*stride];
1605                 }
1606                 if(!(/*ll|*/l|lt|t|rt|p)){
1607                     if(v){
1608                         runs[run_index++]= run;
1609                         run=0;
1610                     }else{
1611                         run++;
1612                     }
1613                 }
1614             }
1615         }
1616         max_index= run_index;
1617         runs[run_index++]= run;
1618         run_index=0;
1619         run= runs[run_index++];
1620
1621         put_symbol2(&s->c, b->state[30], max_index, 0);
1622         if(run_index <= max_index)
1623             put_symbol2(&s->c, b->state[1], run, 3);
1624
1625         for(y=0; y<h; y++){
1626             if(s->c.bytestream_end - s->c.bytestream < w*40){
1627                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1628                 return -1;
1629             }
1630             for(x=0; x<w; x++){
1631                 int v, p=0;
1632                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1633                 v= src[x + y*stride];
1634
1635                 if(y){
1636                     t= src[x + (y-1)*stride];
1637                     if(x){
1638                         lt= src[x - 1 + (y-1)*stride];
1639                     }
1640                     if(x + 1 < w){
1641                         rt= src[x + 1 + (y-1)*stride];
1642                     }
1643                 }
1644                 if(x){
1645                     l= src[x - 1 + y*stride];
1646                     /*if(x > 1){
1647                         if(orientation==1) ll= src[y + (x-2)*stride];
1648                         else               ll= src[x - 2 + y*stride];
1649                     }*/
1650                 }
1651                 if(parent){
1652                     int px= x>>1;
1653                     int py= y>>1;
1654                     if(px<b->parent->width && py<b->parent->height)
1655                         p= parent[px + py*2*stride];
1656                 }
1657                 if(/*ll|*/l|lt|t|rt|p){
1658                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1659
1660                     put_rac(&s->c, &b->state[0][context], !!v);
1661                 }else{
1662                     if(!run){
1663                         run= runs[run_index++];
1664
1665                         if(run_index <= max_index)
1666                             put_symbol2(&s->c, b->state[1], run, 3);
1667                         assert(v);
1668                     }else{
1669                         run--;
1670                         assert(!v);
1671                     }
1672                 }
1673                 if(v){
1674                     int context= av_log2(/*ABS(ll) + */3*ABS(l) + ABS(lt) + 2*ABS(t) + ABS(rt) + ABS(p));
1675                     int l2= 2*ABS(l) + (l<0);
1676                     int t2= 2*ABS(t) + (t<0);
1677
1678                     put_symbol2(&s->c, b->state[context + 2], ABS(v)-1, context-4);
1679                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1680                 }
1681             }
1682         }
1683     }
1684     return 0;
1685 }
1686
1687 static int encode_subband(SnowContext *s, SubBand *b, DWTELEM *src, DWTELEM *parent, int stride, int orientation){
1688 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1689 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1690     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1691 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1692 }
1693
/**
 * Decodes one subband from the range coder into the sparse list b->x_coeff.
 *
 * For every row the list holds one (x, coeff) entry per non-zero coefficient,
 * in increasing x, followed by an end marker whose x is w+1. coeff stores
 * 2*magnitude plus the sign bit in bit 0. One additional end marker is
 * written after the last row.
 *
 * @param parent subband whose x_coeff list supplies the parent context p
 *               (walked at half resolution), or NULL if there is none
 * NOTE(review): orientation is not referenced in this function.
 */
1694 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1695     const int w= b->width;
1696     const int h= b->height;
1697     int x,y;
1698
1699     if(1){
1700         int run, runs;
         /* xc: write cursor; prev_xc: read cursor in the previous row;
          * prev2_xc: start of the row currently being written;
          * parent_xc / prev_parent_xc: read cursor / row start in the parent list */
1701         x_and_coeff *xc= b->x_coeff;
1702         x_and_coeff *prev_xc= NULL;
1703         x_and_coeff *prev2_xc= xc;
1704         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1705         x_and_coeff *prev_parent_xc= parent_xc;
1706
         /* runs: number of run lengths still to be read; run: zero-context
          * coefficients left before the next non-zero one (INT_MAX once the
          * coded runs are exhausted) */
1707         runs= get_symbol2(&s->c, b->state[30], 0);
1708         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1709         else           run= INT_MAX;
1710
1711         for(y=0; y<h; y++){
1712             int v=0;
1713             int lt=0, t=0, rt=0;
1714
             /* preload rt with the previous row's column-0 coefficient so the
              * first "t= rt" below yields the top neighbour of x==0 */
1715             if(y && prev_xc->x == 0){
1716                 rt= prev_xc->coeff;
1717             }
1718             for(x=0; x<w; x++){
1719                 int p=0;
                 /* left neighbour is the value decoded in the previous iteration */
1720                 const int l= v;
1721
                 /* slide the top-row window one column to the right */
1722                 lt= t; t= rt;
1723
1724                 if(y){
1725                     if(prev_xc->x <= x)
1726                         prev_xc++;
1727                     if(prev_xc->x == x + 1)
1728                         rt= prev_xc->coeff;
1729                     else
1730                         rt=0;
1731                 }
1732                 if(parent_xc){
1733                     if(x>>1 > parent_xc->x){
1734                         parent_xc++;
1735                     }
1736                     if(x>>1 == parent_xc->x){
1737                         p= parent_xc->coeff;
1738                     }
1739                 }
1740                 if(/*ll|*/l|lt|t|rt|p){
                     /* neighbours are stored as 2*magnitude+sign, so >>1 gives
                      * the magnitude and t&~1 gives 2*magnitude */
1741                     int context= av_log2(/*ABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1742
1743                     v=get_rac(&s->c, &b->state[0][context]);
1744                     if(v){
                         /* magnitude-1 is coded; bit 0 receives the sign */
1745                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1746                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1747
1748                         xc->x=x;
1749                         (xc++)->coeff= v;
1750                     }
1751                 }else{
1752                     if(!run){
                         /* current run is used up: fetch the next run length, then
                          * decode the non-zero coefficient that ended the run */
1753                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1754                         else           run= INT_MAX;
1755                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1756                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1757
1758                         xc->x=x;
1759                         (xc++)->coeff= v;
1760                     }else{
1761                         int max_run;
1762                         run--;
1763                         v=0;
1764
                         /* jump over further zeros of this run in one step; the
                          * jump is capped by the remaining run, by the next entry
                          * of the previous row (or the row end when y==0) and by
                          * the next parent entry */
1765                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1766                         else  max_run= FFMIN(run, w-x-1);
1767                         if(parent_xc)
1768                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1769                         x+= max_run;
1770                         run-= max_run;
1771                     }
1772                 }
1773             }
1774             (xc++)->x= w+1; //end marker
             /* the row just written becomes the previous row */
1775             prev_xc= prev2_xc;
1776             prev2_xc= xc;
1777
1778             if(parent_xc){
                 /* one parent row serves two rows here: after an odd row skip to
                  * the next parent row, after an even row rewind to its start */
1779                 if(y&1){
1780                     while(parent_xc->x != parent->width+1)
1781                         parent_xc++;
1782                     parent_xc++;
1783                     prev_parent_xc= parent_xc;
1784                 }else{
1785                     parent_xc= prev_parent_xc;
1786                 }
1787             }
1788         }
1789
1790         (xc++)->x= w+1; //end marker
1791     }
1792 }
1793
1794 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1795     const int w= b->width;
1796     int y;
1797     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
1798     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1799     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1800     int new_index = 0;
1801
1802     START_TIMER
1803
1804     if(b->buf == s->spatial_dwt_buffer || s->qlog == LOSSLESS_QLOG){
1805         qadd= 0;
1806         qmul= 1<<QEXPSHIFT;
1807     }
1808
1809     /* If we are on the second or later slice, restore our index. */
1810     if (start_y != 0)
1811         new_index = save_state[0];
1812
1813
1814     for(y=start_y; y<h; y++){
1815         int x = 0;
1816         int v;
1817         DWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1818         memset(line, 0, b->width*sizeof(DWTELEM));
1819         v = b->x_coeff[new_index].coeff;
1820         x = b->x_coeff[new_index++].x;
1821         while(x < w)
1822         {
1823             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1824             register int u= -(v&1);
1825             line[x] = (t^u) - u;
1826
1827             v = b->x_coeff[new_index].coeff;
1828             x = b->x_coeff[new_index++].x;
1829         }
1830     }
1831     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1832         STOP_TIMER("decode_subband")
1833     }
1834
1835     /* Save our variables for the next slice. */
1836     save_state[0] = new_index;
1837
1838     return;
1839 }
1840
1841 static void reset_contexts(SnowContext *s){
1842     int plane_index, level, orientation;
1843
1844     for(plane_index=0; plane_index<3; plane_index++){
1845         for(level=0; level<s->spatial_decomposition_count; level++){
1846             for(orientation=level ? 1:0; orientation<4; orientation++){
1847                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1848             }
1849         }
1850     }
1851     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1852     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1853 }
1854
1855 static int alloc_blocks(SnowContext *s){
1856     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1857     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1858
1859     s->b_width = w;
1860     s->b_height= h;
1861
1862     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1863     return 0;
1864 }
1865
1866 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1867     uint8_t *bytestream= d->bytestream;
1868     uint8_t *bytestream_start= d->bytestream_start;
1869     *d= *s;
1870     d->bytestream= bytestream;
1871     d->bytestream_start= bytestream_start;
1872 }
1873
1874 //near copy & paste from dsputil, FIXME
/**
 * Sums the pixels of a w x w square.
 * @param pix       top-left pixel of the square
 * @param line_size distance in bytes between the starts of two rows
 * @param w         side length of the square
 * @return sum of all w*w pixel values
 */
static int pix_sum(uint8_t * pix, int line_size, int w)
{
    int total = 0;
    int row, col;

    for (row = 0; row < w; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < w; col++)
            total += line[col];
    }
    return total;
}
1889
1890 //near copy & paste from dsputil, FIXME
1891 static int pix_norm1(uint8_t * pix, int line_size, int w)
1892 {
1893     int s, i, j;
1894     uint32_t *sq = squareTbl + 256;
1895
1896     s = 0;
1897     for (i = 0; i < w; i++) {
1898         for (j = 0; j < w; j ++) {
1899             s += sq[pix[0]];
1900             pix ++;
1901         }
1902         pix += line_size - w;
1903     }
1904     return s;
1905 }
1906
1907 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int type){
1908     const int w= s->b_width << s->block_max_depth;
1909     const int rem_depth= s->block_max_depth - level;
1910     const int index= (x + y*w) << rem_depth;
1911     const int block_w= 1<<rem_depth;
1912     BlockNode block;
1913     int i,j;
1914
1915     block.color[0]= l;
1916     block.color[1]= cb;
1917     block.color[2]= cr;
1918     block.mx= mx;
1919     block.my= my;
1920     block.type= type;
1921     block.level= level;
1922
1923     for(j=0; j<block_w; j++){
1924         for(i=0; i<block_w; i++){
1925             s->block[index + i + j*w]= block;
1926         }
1927     }
1928 }
1929
1930 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1931     const int offset[3]= {
1932           y*c->  stride + x,
1933         ((y*c->uvstride + x)>>1),
1934         ((y*c->uvstride + x)>>1),
1935     };
1936     int i;
1937     for(i=0; i<3; i++){
1938         c->src[0][i]= src [i];
1939         c->ref[0][i]= ref [i] + offset[i];
1940     }
1941     assert(!ref_index);
1942 }
1943
1944 //FIXME copy&paste
/* Named rows of the predictor table P[10][2] built in encode_q_branch() and
 * passed to ff_epzs_motion_search(); element [0] is the x component and
 * element [1] the y component of each candidate vector.
 * NOTE(review): P_MV1 is not referenced in the visible part of this file. */
1945 #define P_LEFT P[1]
1946 #define P_TOP P[2]
1947 #define P_TOPRIGHT P[3]
1948 #define P_MEDIAN P[4]
1949 #define P_MV1 P[9]
1950 #define FLAG_QPEL   1 //must be 1
1951
/**
 * Chooses and writes the coding of one quadtree block (non-iterative ME path).
 *
 * The block is trial-encoded twice into scratch range coders: once as inter
 * (motion vector found by the EPZS search plus sub-pel refinement) and once
 * as intra (mean colour of the block). If the block is not at the maximum
 * depth, the four children are also encoded recursively. Whichever of
 * split / intra / inter has the lowest score is kept.
 * On keyframes nothing is searched or written: the block is set to intra with
 * the predicted values.
 *
 * @param level quadtree depth of this block, 0 being the largest block size
 * @param x,y   block coordinates at this depth
 * @return score of the coding that was kept (0 on keyframes)
 */
1952 static int encode_q_branch(SnowContext *s, int level, int x, int y){
     /* scratch output and context state for the inter (p) and intra (i) trials */
1953     uint8_t p_buffer[1024];
1954     uint8_t i_buffer[1024];
1955     uint8_t p_state[sizeof(s->block_state)];
1956     uint8_t i_state[sizeof(s->block_state)];
1957     RangeCoder pc, ic;
     /* output position of the real coder before anything is written for this block */
1958     uint8_t *pbbak= s->c.bytestream;
1959     uint8_t *pbbak_start= s->c.bytestream_start;
1960     int score, score2, iscore, i_len, p_len, block_s, sum;
1961     const int w= s->b_width  << s->block_max_depth;
1962     const int h= s->b_height << s->block_max_depth;
1963     const int rem_depth= s->block_max_depth - level;
1964     const int index= (x + y*w) << rem_depth;
1965     const int block_w= 1<<(LOG2_MB_SIZE - level);
1966     int trx= (x+1)<<rem_depth;
1967     int try= (y+1)<<rem_depth;
     /* neighbouring blocks; null_block stands in for neighbours outside the grid */
1968     BlockNode *left  = x ? &s->block[index-1] : &null_block;
1969     BlockNode *top   = y ? &s->block[index-w] : &null_block;
1970     BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1971     BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1972     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1973     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
     /* colour is predicted from the left block, the motion vector from the
      * median of left, top and top-right */
1974     int pl = left->color[0];
1975     int pcb= left->color[1];
1976     int pcr= left->color[2];
1977     int pmx= mid_pred(left->mx, top->mx, tr->mx);
1978     int pmy= mid_pred(left->my, top->my, tr->my);
1979     int mx=0, my=0;
1980     int l,cr,cb;
1981     const int stride= s->current_picture.linesize[0];
1982     const int uvstride= s->current_picture.linesize[1];
1983     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1984                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1985                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1986     int P[10][2];
1987     int16_t last_mv[3][2];
1988     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1989     const int shift= 1+qpel;
1990     MotionEstContext *c= &s->m.me;
     /* context selectors for the motion vector and split-flag symbols */
1991     int mx_context= av_log2(2*ABS(left->mx - top->mx));
1992     int my_context= av_log2(2*ABS(left->my - top->my));
1993     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1994
1995     assert(sizeof(s->block_state) >= 256);
1996     if(s->keyframe){
1997         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
1998         return 0;
1999     }
2000
2001 //    clip predictors / edge ?
2002
     /* candidate vectors for the search: spatial neighbours ... */
2003     P_LEFT[0]= left->mx;
2004     P_LEFT[1]= left->my;
2005     P_TOP [0]= top->mx;
2006     P_TOP [1]= top->my;
2007     P_TOPRIGHT[0]= tr->mx;
2008     P_TOPRIGHT[1]= tr->my;
2009
     /* ... and the vectors still stored in s->block for this block and its
      * right and bottom neighbours */
2010     last_mv[0][0]= s->block[index].mx;
2011     last_mv[0][1]= s->block[index].my;
2012     last_mv[1][0]= right->mx;
2013     last_mv[1][1]= right->my;
2014     last_mv[2][0]= bottom->mx;
2015     last_mv[2][1]= bottom->my;
2016
2017     s->m.mb_stride=2;
2018     s->m.mb_x=
2019     s->m.mb_y= 0;
2020     s->m.me.skip= 0;
2021
2022     init_ref(c, current_data, s->last_picture.data, NULL, block_w*x, block_w*y, 0);
2023
2024     assert(s->m.me.  stride ==   stride);
2025     assert(s->m.me.uvstride == uvstride);
2026
2027     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
2028     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
2029     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
2030     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
2031
     /* search range relative to the block: the block grid area extended by
      * 14 pixels (16-2) on every side */
2032     c->xmin = - x*block_w - 16+2;
2033     c->ymin = - y*block_w - 16+2;
2034     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2035     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
2036
     /* clip the candidates to the search range (range is in full pixels,
      * candidates are scaled by 1<<shift) */
2037     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
2038     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
2039     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
2040     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
2041     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
2042     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
2043     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
2044
2045     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
2046     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
2047
     /* first block row has no top neighbours: predict from the left only */
2048     if (!y) {
2049         c->pred_x= P_LEFT[0];
2050         c->pred_y= P_LEFT[1];
2051     } else {
2052         c->pred_x = P_MEDIAN[0];
2053         c->pred_y = P_MEDIAN[1];
2054     }
2055
2056     score= ff_epzs_motion_search(&s->m, &mx, &my, P, 0, /*ref_index*/ 0, last_mv,
2057                              (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
2058
2059     assert(mx >= c->xmin);
2060     assert(mx <= c->xmax);
2061     assert(my >= c->ymin);
2062     assert(my <= c->ymax);
2063
     /* refine the vector, then replace the search score by the final block
      * score for the chosen vector */
2064     score= s->m.me.sub_motion_search(&s->m, &mx, &my, score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
2065     score= ff_get_mb_score(&s->m, mx, my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
2066     //FIXME if mb_cmp != SSE then intra cant be compared currently and mb_penalty vs. lambda2
2067
2068   //  subpel search
     /* inter trial: clone the coder, redirect its output to p_buffer and code
      * "no split" (if applicable), "not intra" and the vector residual */
2069     pc= s->c;
2070     pc.bytestream_start=
2071     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
2072     memcpy(p_state, s->block_state, sizeof(s->block_state));
2073
2074     if(level!=s->block_max_depth)
2075         put_rac(&pc, &p_state[4 + s_context], 1);
2076     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
2077     put_symbol(&pc, &p_state[128 + 32*mx_context], mx - pmx, 1);
2078     put_symbol(&pc, &p_state[128 + 32*my_context], my - pmy, 1);
2079     p_len= pc.bytestream - pc.bytestream_start;
     /* add the rate term: bytes emitted, change in outstanding bytes and
      * change in log2 of the coder range, weighted by lambda2 */
2080     score += (s->lambda2*(p_len*8
2081               + (pc.outstanding_count - s->c.outstanding_count)*8
2082               + (-av_log2(pc.range)    + av_log2(s->c.range))
2083              ))>>FF_LAMBDA_SHIFT;
2084
     /* intra trial: l is the rounded mean luma; iscore is
      * sum(pix^2) - 2*l*sum(pix) + l*l*N, i.e. the squared error of
      * replacing the block by l (luma only, chroma terms are disabled) */
2085     block_s= block_w*block_w;
2086     sum = pix_sum(current_data[0], stride, block_w);
2087     l= (sum + block_s/2)/block_s;
2088     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
2089
2090     block_s= block_w*block_w>>2;
2091     sum = pix_sum(current_data[1], uvstride, block_w>>1);
2092     cb= (sum + block_s/2)/block_s;
2093 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
2094     sum = pix_sum(current_data[2], uvstride, block_w>>1);
2095     cr= (sum + block_s/2)/block_s;
2096 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
2097
2098     ic= s->c;
2099     ic.bytestream_start=
2100     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
2101     memcpy(i_state, s->block_state, sizeof(s->block_state));
2102     if(level!=s->block_max_depth)
2103         put_rac(&ic, &i_state[4 + s_context], 1);
2104     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
2105     put_symbol(&ic, &i_state[32],  l-pl , 1);
2106     put_symbol(&ic, &i_state[64], cb-pcb, 1);
2107     put_symbol(&ic, &i_state[96], cr-pcr, 1);
2108     i_len= ic.bytestream - ic.bytestream_start;
2109     iscore += (s->lambda2*(i_len*8
2110               + (ic.outstanding_count - s->c.outstanding_count)*8
2111               + (-av_log2(ic.range)    + av_log2(s->c.range))
2112              ))>>FF_LAMBDA_SHIFT;
2113
2114 //    assert(score==256*256*256*64-1);
2115     assert(iscore < 255*255*256 + s->lambda2*10);
2116     assert(iscore >= 0);
2117     assert(l>=0 && l<=255);
2118     assert(pl>=0 && pl<=255);
2119
     /* top-level blocks feed the scene change detector */
2120     if(level==0){
2121         int varc= iscore >> 8;
2122         int vard= score >> 8;
2123         if (vard <= 64 || vard < varc)
2124             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
2125         else
2126             c->scene_change_score+= s->m.qscale;
2127     }
2128
     /* split trial: written straight into the real coder; kept only if it
      * beats both unsplit trials, otherwise overwritten below */
2129     if(level!=s->block_max_depth){
2130         put_rac(&s->c, &s->block_state[4 + s_context], 0);
2131         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
2132         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
2133         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
2134         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
2135         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
2136
2137         if(score2 < score && score2 < iscore)
2138             return score2;
2139     }
2140
     /* commit the cheaper unsplit trial: copy its bytes to the saved output
      * position and adopt its coder and context state */
2141     if(iscore < score){
2142         memcpy(pbbak, i_buffer, i_len);
2143         s->c= ic;
2144         s->c.bytestream_start= pbbak_start;
2145         s->c.bytestream= pbbak + i_len;
2146         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, BLOCK_INTRA);
2147         memcpy(s->block_state, i_state, sizeof(s->block_state));
2148         return iscore;
2149     }else{
2150         memcpy(pbbak, p_buffer, p_len);
2151         s->c= pc;
2152         s->c.bytestream_start= pbbak_start;
2153         s->c.bytestream= pbbak + p_len;
2154         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, 0);
2155         memcpy(s->block_state, p_state, sizeof(s->block_state));
2156         return score;
2157     }
2158 }
2159
2160 static always_inline int same_block(BlockNode *a, BlockNode *b){
2161     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2162         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2163     }else{
2164         return !((a->mx - b->mx) | (a->my - b->my) | ((a->type ^ b->type)&BLOCK_INTRA));
2165     }
2166 }
2167
2168 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2169     const int w= s->b_width  << s->block_max_depth;
2170     const int rem_depth= s->block_max_depth - level;
2171     const int index= (x + y*w) << rem_depth;
2172     int trx= (x+1)<<rem_depth;
2173     BlockNode *b= &s->block[index];
2174     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2175     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2176     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2177     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2178     int pl = left->color[0];
2179     int pcb= left->color[1];
2180     int pcr= left->color[2];
2181     int pmx= mid_pred(left->mx, top->mx, tr->mx);
2182     int pmy= mid_pred(left->my, top->my, tr->my);
2183     int mx_context= av_log2(2*ABS(left->mx - top->mx));
2184     int my_context= av_log2(2*ABS(left->my - top->my));
2185     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2186
2187     if(s->keyframe){
2188         set_blocks(s, level, x, y, pl, pcb, pcr, pmx, pmy, BLOCK_INTRA);
2189         return;
2190     }
2191
2192     if(level!=s->block_max_depth){
2193         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2194             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2195         }else{
2196             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2197             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2198             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2199             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2200             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2201             return;
2202         }
2203     }
2204     if(b->type & BLOCK_INTRA){
2205         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2206         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2207         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2208         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2209         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, BLOCK_INTRA);
2210     }else{
2211         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2212         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2213         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2214         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, 0);
2215     }
2216 }
2217
2218 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2219     const int w= s->b_width << s->block_max_depth;
2220     const int rem_depth= s->block_max_depth - level;
2221     const int index= (x + y*w) << rem_depth;
2222     int trx= (x+1)<<rem_depth;
2223     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2224     BlockNode *top   = y ? &s->block[index-w] : &null_block;
2225     BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2226     BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2227     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2228
2229     if(s->keyframe){
2230         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, BLOCK_INTRA);
2231         return;
2232     }
2233
2234     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2235         int type;
2236         int l = left->color[0];
2237         int cb= left->color[1];
2238         int cr= left->color[2];
2239         int mx= mid_pred(left->mx, top->mx, tr->mx);
2240         int my= mid_pred(left->my, top->my, tr->my);
2241         int mx_context= av_log2(2*ABS(left->mx - top->mx)) + 0*av_log2(2*ABS(tr->mx - top->mx));
2242         int my_context= av_log2(2*ABS(left->my - top->my)) + 0*av_log2(2*ABS(tr->my - top->my));
2243
2244         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2245
2246         if(type){
2247             l += get_symbol(&s->c, &s->block_state[32], 1);
2248             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2249             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2250         }else{
2251             mx+= get_symbol(&s->c, &s->block_state[128 + 32*mx_context], 1);
2252             my+= get_symbol(&s->c, &s->block_state[128 + 32*my_context], 1);
2253         }
2254         set_blocks(s, level, x, y, l, cb, cr, mx, my, type);
2255     }else{
2256         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2257         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2258         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2259         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2260     }
2261 }
2262
2263 static void encode_blocks(SnowContext *s){
2264     int x, y;
2265     int w= s->b_width;
2266     int h= s->b_height;
2267
2268     if(s->avctx->me_method == ME_ITER && !s->keyframe)
2269         iterative_me(s);
2270
2271     for(y=0; y<h; y++){
2272         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2273             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2274             return;
2275         }
2276         for(x=0; x<w; x++){
2277             if(s->avctx->me_method == ME_ITER)
2278                 encode_q_branch2(s, 0, x, y);
2279             else
2280                 encode_q_branch (s, 0, x, y);
2281         }
2282     }
2283 }
2284
2285 static void decode_blocks(SnowContext *s){
2286     int x, y;
2287     int w= s->b_width;
2288     int h= s->b_height;
2289
2290     for(y=0; y<h; y++){
2291         for(x=0; x<w; x++){
2292             decode_q_branch(s, 0, x, y);
2293         }
2294     }
2295 }
2296
2297 static void mc_block(uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2298     int x, y;
2299 START_TIMER
2300     for(y=0; y < b_h+5; y++){
2301         for(x=0; x < b_w; x++){
2302             int a0= src[x    ];
2303             int a1= src[x + 1];
2304             int a2= src[x + 2];
2305             int a3= src[x + 3];
2306             int a4= src[x + 4];
2307             int a5= src[x + 5];
2308 //            int am= 9*(a1+a2) - (a0+a3);
2309             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2310 //            int am= 18*(a2+a3) - 2*(a1+a4);
2311 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2312 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2313
2314 //            if(b_w==16) am= 8*(a1+a2);
2315
2316             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2317             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2318
2319             /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2320             if(am&(~255)) am= ~(am>>31);
2321
2322             tmp[x] = am;
2323
2324 /*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2325             else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2326             else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2327             else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2328         }
2329         tmp += stride;
2330         src += stride;
2331     }
2332     tmp -= (b_h+5)*stride;
2333
2334     for(y=0; y < b_h; y++){
2335         for(x=0; x < b_w; x++){
2336             int a0= tmp[x + 0*stride];
2337             int a1= tmp[x + 1*stride];
2338             int a2= tmp[x + 2*stride];
2339             int a3= tmp[x + 3*stride];
2340             int a4= tmp[x + 4*stride];
2341             int a5= tmp[x + 5*stride];
2342             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2343 //            int am= 18*(a2+a3) - 2*(a1+a4);
2344 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2345             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2346
2347 //            if(b_w==16) am= 8*(a1+a2);
2348
2349             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2350             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2351
2352             if(am&(~255)) am= ~(am>>31);
2353
2354             dst[x] = am;
2355 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2356             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2357             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2358             else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2359         }
2360         dst += stride;
2361         tmp += stride;
2362     }
2363 STOP_TIMER("mc_block")
2364 }
2365
/**
 * Generates mc_block_hpel<fx><fy><blk>(): a wrapper that runs mc_block() on a
 * square blk x blk block with the fixed sub-pel offset (fx, fy).
 * The wrapper hands mc_block() a source pointer moved 2 pixels left and
 * 2 rows up, plus a stack scratch buffer of stride*(blk+5) bytes.
 * The h argument must equal blk (checked by assert).
 */
#define mca(fx, fy, blk)\
static void mc_block_hpel ## fx ## fy ## blk(uint8_t *dst, uint8_t *src, int stride, int h){\
    uint8_t scratch[stride*(blk+5)];\
    assert(h==blk);\
    mc_block(dst, src - 2 - 2*stride, scratch, stride, blk, blk, fx, fy);\
}

/* 16x16 variants */
mca(0, 0, 16)
mca(8, 0, 16)
mca(0, 8, 16)
mca(8, 8, 16)
/* 8x8 variants */
mca(0, 0, 8)
mca(8, 0, 8)
mca(0, 8, 8)
mca(8, 8, 8)
2381
2382 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *src, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2383     if(block->type & BLOCK_INTRA){
2384         int x, y;
2385         const int color = block->color[plane_index];
2386         const int color4= color*0x01010101;
2387         if(b_w==32){
2388             for(y=0; y < b_h; y++){
2389                 *(uint32_t*)&dst[0 + y*stride]= color4;
2390                 *(uint32_t*)&dst[4 + y*stride]= color4;
2391                 *(uint32_t*)&dst[8 + y*stride]= color4;
2392                 *(uint32_t*)&dst[12+ y*stride]= color4;
2393                 *(uint32_t*)&dst[16+ y*stride]= color4;
2394                 *(uint32_t*)&dst[20+ y*stride]= color4;
2395                 *(uint32_t*)&dst[24+ y*stride]= color4;
2396                 *(uint32_t*)&dst[28+ y*stride]= color4;
2397             }
2398         }else if(b_w==16){
2399             for(y=0; y < b_h; y++){
2400                 *(uint32_t*)&dst[0 + y*stride]= color4;
2401                 *(uint32_t*)&dst[4 + y*stride]= color4;
2402                 *(uint32_t*)&dst[8 + y*stride]= color4;
2403                 *(uint32_t*)&dst[12+ y*stride]= color4;
2404             }
2405         }else if(b_w==8){
2406             for(y=0; y < b_h; y++){
2407                 *(uint32_t*)&dst[0 + y*stride]= color4;
2408                 *(uint32_t*)&dst[4 + y*stride]= color4;
2409             }
2410         }else if(b_w==4){
2411             for(y=0; y < b_h; y++){
2412                 *(uint32_t*)&dst[0 + y*stride]= color4;
2413             }
2414         }else{
2415             for(y=0; y < b_h; y++){
2416                 for(x=0; x < b_w; x++){
2417                     dst[x + y*stride]= color;
2418                 }
2419             }
2420         }
2421     }else{
2422         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2423         int mx= block->mx*scale;
2424         int my= block->my*scale;
2425         const int dx= mx&15;
2426         const int dy= my&15;
2427         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2428         sx += (mx>>4) - 2;
2429         sy += (my>>4) - 2;
2430         src += sx + sy*stride;
2431         if(   (unsigned)sx >= w - b_w - 4
2432            || (unsigned)sy >= h - b_h - 4){
2433             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2434             src= tmp + MB_SIZE;
2435         }
2436 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2437 //        assert(!(b_w&(b_w-1)));
2438         assert(b_w>1 && b_h>1);
2439         assert(tab_index>=0 && tab_index<4 || b_w==32);
2440         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2441             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2442         else if(b_w==32){
2443             int y;
2444             for(y=0; y<b_h; y+=16){
2445                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2446                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2447             }
2448         }else if(b_w==b_h)
2449             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2450         else if(b_w==2*b_h){
2451             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2452             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2453         }else{
2454             assert(2*b_w==b_h);
2455             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2456             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2457         }
2458     }
2459 }
2460
2461 void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2462                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2463     int y, x;
2464     DWTELEM * dst;
2465     for(y=0; y<b_h; y++){
2466         //FIXME ugly missue of obmc_stride
2467         uint8_t *obmc1= obmc + y*obmc_stride;
2468         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2469         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2470         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2471         dst = slice_buffer_get_line(sb, src_y + y);
2472         for(x=0; x<b_w; x++){
2473             int v=   obmc1[x] * block[3][x + y*src_stride]
2474                     +obmc2[x] * block[2][x + y*src_stride]
2475                     +obmc3[x] * block[1][x + y*src_stride]
2476                     +obmc4[x] * block[0][x + y*src_stride];
2477
2478             v <<= 8 - LOG2_OBMC_MAX;
2479             if(FRAC_BITS != 8){
2480                 v += 1<<(7 - FRAC_BITS);
2481                 v >>= 8 - FRAC_BITS;
2482             }
2483             if(add){
2484                 v += dst[x + src_x];
2485                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2486                 if(v&(~255)) v= ~(v>>31);
2487                 dst8[x + y*src_stride] = v;
2488             }else{
2489                 dst[x + src_x] -= v;
2490             }
2491         }
2492     }
2493 }
2494
2495 //FIXME name clenup (b_w, block_w, b_width stuff)
2496 static always_inline void add_yblock_buffered(SnowContext *s, slice_buffer * sb, DWTELEM *old_dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int plane_index){
2497     DWTELEM * dst = NULL;
2498     const int b_width = s->b_width  << s->block_max_depth;
2499     const int b_height= s->b_height << s->block_max_depth;
2500     const int b_stride= b_width;
2501     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2502     BlockNode *rt= lt+1;
2503     BlockNode *lb= lt+b_stride;
2504     BlockNode *rb= lb+1;
2505     uint8_t *block[4];
2506     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2507     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2508     uint8_t *ptmp;
2509     int x,y;
2510
2511     if(b_x<0){
2512         lt= rt;
2513         lb= rb;
2514     }else if(b_x + 1 >= b_width){
2515         rt= lt;
2516         rb= lb;
2517     }
2518     if(b_y<0){
2519         lt= lb;
2520         rt= rb;
2521     }else if(b_y + 1 >= b_height){
2522         lb= lt;
2523         rb= rt;
2524     }
2525
2526     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2527         obmc -= src_x;
2528         b_w += src_x;
2529         src_x=0;
2530     }else if(src_x + b_w > w){
2531         b_w = w - src_x;
2532     }
2533     if(src_y<0){
2534         obmc -= src_y*obmc_stride;
2535         b_h += src_y;
2536         src_y=0;
2537     }else if(src_y + b_h> h){
2538         b_h = h - src_y;
2539     }
2540
2541     if(b_w<=0 || b_h<=0) return;
2542
2543 assert(src_stride > 2*MB_SIZE + 5);
2544 //    old_dst += src_x + src_y*dst_stride;
2545     dst8+= src_x + src_y*src_stride;
2546 //    src += src_x + src_y*src_stride;
2547
2548     ptmp= tmp + 3*tmp_step;
2549     block[0]= ptmp;
2550     ptmp+=tmp_step;
2551     pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2552
2553     if(same_block(lt, rt)){
2554         block[1]= block[0];
2555     }else{
2556         block[1]= ptmp;
2557         ptmp+=tmp_step;
2558         pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2559     }
2560
2561     if(same_block(lt, lb)){
2562         block[2]= block[0];
2563     }else if(same_block(rt, lb)){
2564         block[2]= block[1];
2565     }else{
2566         block[2]= ptmp;
2567         ptmp+=tmp_step;
2568         pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2569     }
2570
2571     if(same_block(lt, rb) ){
2572         block[3]= block[0];
2573     }else if(same_block(rt, rb)){
2574         block[3]= block[1];
2575     }else if(same_block(lb, rb)){
2576         block[3]= block[2];
2577     }else{
2578         block[3]= ptmp;
2579         pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2580     }
2581 #if 0
2582     for(y=0; y<b_h; y++){
2583         for(x=0; x<b_w; x++){
2584             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2585             if(add) dst[x + y*dst_stride] += v;
2586             else    dst[x + y*dst_stride] -= v;
2587         }
2588     }
2589     for(y=0; y<b_h; y++){
2590         uint8_t *obmc2= obmc + (obmc_stride>>1);
2591         for(x=0; x<b_w; x++){
2592             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2593             if(add) dst[x + y*dst_stride] += v;
2594             else    dst[x + y*dst_stride] -= v;
2595         }
2596     }
2597     for(y=0; y<b_h; y++){
2598         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2599         for(x=0; x<b_w; x++){
2600             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2601             if(add) dst[x + y*dst_stride] += v;
2602             else    dst[x + y*dst_stride] -= v;
2603         }
2604     }
2605     for(y=0; y<b_h; y++){
2606         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2607         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2608         for(x=0; x<b_w; x++){
2609             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2610             if(add) dst[x + y*dst_stride] += v;
2611             else    dst[x + y*dst_stride] -= v;
2612         }
2613     }
2614 #else
2615 {
2616
2617     START_TIMER
2618
2619     s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2620         STOP_TIMER("Inner add y block")
2621 }
2622 #endif
2623 }
2624
2625 //FIXME name clenup (b_w, block_w, b_width stuff)
2626 static always_inline void add_yblock(SnowContext *s, DWTELEM *dst, uint8_t *dst8, uint8_t *src, uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2627     const int b_width = s->b_width  << s->block_max_depth;
2628     const int b_height= s->b_height << s->block_max_depth;
2629     const int b_stride= b_width;
2630     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2631     BlockNode *rt= lt+1;
2632     BlockNode *lb= lt+b_stride;
2633     BlockNode *rb= lb+1;
2634     uint8_t *block[4];
2635     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2636     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2637     uint8_t *ptmp;
2638     int x,y;
2639
2640     if(b_x<0){
2641         lt= rt;
2642         lb= rb;
2643     }else if(b_x + 1 >= b_width){
2644         rt= lt;
2645         rb= lb;
2646     }
2647     if(b_y<0){
2648         lt= lb;
2649         rt= rb;
2650     }else if(b_y + 1 >= b_height){
2651         lb= lt;
2652         rb= rt;
2653     }
2654
2655     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2656         obmc -= src_x;
2657         b_w += src_x;
2658         if(!offset_dst)
2659             dst -= src_x;
2660         src_x=0;
2661     }else if(src_x + b_w > w){
2662         b_w = w - src_x;
2663     }
2664     if(src_y<0){
2665         obmc -= src_y*obmc_stride;
2666         b_h += src_y;
2667         if(!offset_dst)
2668             dst -= src_y*dst_stride;
2669         src_y=0;
2670     }else if(src_y + b_h> h){
2671         b_h = h - src_y;
2672     }
2673
2674     if(b_w<=0 || b_h<=0) return;
2675
2676 assert(src_stride > 2*MB_SIZE + 5);
2677     if(offset_dst)
2678         dst += src_x + src_y*dst_stride;
2679     dst8+= src_x + src_y*src_stride;
2680 //    src += src_x + src_y*src_stride;
2681
2682     ptmp= tmp + 3*tmp_step;
2683     block[0]= ptmp;
2684     ptmp+=tmp_step;
2685     pred_block(s, block[0], src, tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2686
2687     if(same_block(lt, rt)){
2688         block[1]= block[0];
2689     }else{
2690         block[1]= ptmp;
2691         ptmp+=tmp_step;
2692         pred_block(s, block[1], src, tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2693     }
2694
2695     if(same_block(lt, lb)){
2696         block[2]= block[0];
2697     }else if(same_block(rt, lb)){
2698         block[2]= block[1];
2699     }else{
2700         block[2]= ptmp;
2701         ptmp+=tmp_step;
2702         pred_block(s, block[2], src, tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2703     }
2704
2705     if(same_block(lt, rb) ){
2706         block[3]= block[0];
2707     }else if(same_block(rt, rb)){
2708         block[3]= block[1];
2709     }else if(same_block(lb, rb)){
2710         block[3]= block[2];
2711     }else{
2712         block[3]= ptmp;
2713         pred_block(s, block[3], src, tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2714     }
2715 #if 0
2716     for(y=0; y<b_h; y++){
2717         for(x=0; x<b_w; x++){
2718             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2719             if(add) dst[x + y*dst_stride] += v;
2720             else    dst[x + y*dst_stride] -= v;
2721         }
2722     }
2723     for(y=0; y<b_h; y++){
2724         uint8_t *obmc2= obmc + (obmc_stride>>1);
2725         for(x=0; x<b_w; x++){
2726             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2727             if(add) dst[x + y*dst_stride] += v;
2728             else    dst[x + y*dst_stride] -= v;
2729         }
2730     }
2731     for(y=0; y<b_h; y++){
2732         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2733         for(x=0; x<b_w; x++){
2734             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2735             if(add) dst[x + y*dst_stride] += v;
2736             else    dst[x + y*dst_stride] -= v;
2737         }
2738     }
2739     for(y=0; y<b_h; y++){
2740         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2741         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2742         for(x=0; x<b_w; x++){
2743             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2744             if(add) dst[x + y*dst_stride] += v;
2745             else    dst[x + y*dst_stride] -= v;
2746         }
2747     }
2748 #else
2749     for(y=0; y<b_h; y++){
2750         //FIXME ugly missue of obmc_stride
2751         uint8_t *obmc1= obmc + y*obmc_stride;
2752         uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2753         uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2754         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2755         for(x=0; x<b_w; x++){
2756             int v=   obmc1[x] * block[3][x + y*src_stride]
2757                     +obmc2[x] * block[2][x + y*src_stride]
2758                     +obmc3[x] * block[1][x + y*src_stride]
2759                     +obmc4[x] * block[0][x + y*src_stride];
2760
2761             v <<= 8 - LOG2_OBMC_MAX;
2762             if(FRAC_BITS != 8){
2763                 v += 1<<(7 - FRAC_BITS);
2764                 v >>= 8 - FRAC_BITS;
2765             }
2766             if(add){
2767                 v += dst[x + y*dst_stride];
2768                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2769                 if(v&(~255)) v= ~(v>>31);
2770                 dst8[x + y*src_stride] = v;
2771             }else{
2772                 dst[x + y*dst_stride] -= v;
2773             }
2774         }
2775     }
2776 #endif
2777 }
2778
2779 static always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, DWTELEM * old_buffer, int plane_index, int add, int mb_y){
2780     Plane *p= &s->plane[plane_index];
2781     const int mb_w= s->b_width  << s->block_max_depth;
2782     const int mb_h= s->b_height << s->block_max_depth;
2783     int x, y, mb_x;
2784     int block_size = MB_SIZE >> s->block_max_depth;
2785     int block_w    = plane_index ? block_size/2 : block_size;
2786     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2787     int obmc_stride= plane_index ? block_size : 2*block_size;
2788     int ref_stride= s->current_picture.linesize[plane_index];
2789     uint8_t *ref  = s->last_picture.data[plane_index];
2790     uint8_t *dst8= s->current_picture.data[plane_index];
2791     int w= p->width;
2792     int h= p->height;
2793     START_TIMER
2794
2795     if(s->keyframe || (s->avctx->debug&512)){
2796         if(mb_y==mb_h)
2797             return;
2798
2799         if(add){
2800             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2801             {
2802 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2803                 DWTELEM * line = sb->line[y];
2804                 for(x=0; x<w; x++)
2805                 {
2806 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2807                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2808                     v >>= FRAC_BITS;
2809                     if(v&(~255)) v= ~(v>>31);
2810                     dst8[x + y*ref_stride]= v;
2811                 }
2812             }
2813         }else{
2814             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2815             {
2816 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2817                 DWTELEM * line = sb->line[y];
2818                 for(x=0; x<w; x++)
2819                 {
2820                     line[x] -= 128 << FRAC_BITS;
2821 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2822                 }
2823             }
2824         }
2825
2826         return;
2827     }
2828
2829         for(mb_x=0; mb_x<=mb_w; mb_x++){
2830             START_TIMER
2831
2832             add_yblock_buffered(s, sb, old_buffer, dst8, ref, obmc,
2833                        block_w*mb_x - block_w/2,
2834                        block_w*mb_y - block_w/2,
2835                        block_w, block_w,
2836                        w, h,
2837                        w, ref_stride, obmc_stride,
2838                        mb_x - 1, mb_y - 1,
2839                        add, plane_index);
2840
2841             STOP_TIMER("add_yblock")
2842         }
2843
2844     STOP_TIMER("predict_slice")
2845 }
2846
2847 static always_inline void predict_slice(SnowContext *s, DWTELEM *buf, int plane_index, int add, int mb_y){
2848     Plane *p= &s->plane[plane_index];
2849     const int mb_w= s->b_width  << s->block_max_depth;
2850     const int mb_h= s->b_height << s->block_max_depth;
2851     int x, y, mb_x;
2852     int block_size = MB_SIZE >> s->block_max_depth;
2853     int block_w    = plane_index ? block_size/2 : block_size;
2854     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2855     const int obmc_stride= plane_index ? block_size : 2*block_size;
2856     int ref_stride= s->current_picture.linesize[plane_index];
2857     uint8_t *ref  = s->last_picture.data[plane_index];
2858     uint8_t *dst8= s->current_picture.data[plane_index];
2859     int w= p->width;
2860     int h= p->height;
2861     START_TIMER
2862
2863     if(s->keyframe || (s->avctx->debug&512)){
2864         if(mb_y==mb_h)
2865             return;
2866
2867         if(add){
2868             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2869                 for(x=0; x<w; x++){
2870                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2871                     v >>= FRAC_BITS;
2872                     if(v&(~255)) v= ~(v>>31);
2873                     dst8[x + y*ref_stride]= v;
2874                 }
2875             }
2876         }else{
2877             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2878                 for(x=0; x<w; x++){
2879                     buf[x + y*w]-= 128<<FRAC_BITS;
2880                 }
2881             }
2882         }
2883
2884         return;
2885     }
2886
2887         for(mb_x=0; mb_x<=mb_w; mb_x++){
2888             START_TIMER
2889
2890             add_yblock(s, buf, dst8, ref, obmc,
2891                        block_w*mb_x - block_w/2,
2892                        block_w*mb_y - block_w/2,
2893                        block_w, block_w,
2894                        w, h,
2895                        w, ref_stride, obmc_stride,
2896                        mb_x - 1, mb_y - 1,
2897                        add, 1, plane_index);
2898
2899             STOP_TIMER("add_yblock")
2900         }
2901
2902     STOP_TIMER("predict_slice")
2903 }
2904
2905 static always_inline void predict_plane(SnowContext *s, DWTELEM *buf, int plane_index, int add){
2906     const int mb_h= s->b_height << s->block_max_depth;
2907     int mb_y;
2908     for(mb_y=0; mb_y<=mb_h; mb_y++)
2909         predict_slice(s, buf, plane_index, add, mb_y);
2910 }
2911
2912 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2913     int i, x2, y2;
2914     Plane *p= &s->plane[plane_index];
2915     const int block_size = MB_SIZE >> s->block_max_depth;
2916     const int block_w    = plane_index ? block_size/2 : block_size;
2917     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2918     const int obmc_stride= plane_index ? block_size : 2*block_size;
2919     const int ref_stride= s->current_picture.linesize[plane_index];
2920     uint8_t *ref= s->   last_picture.data[plane_index];
2921     uint8_t *src= s-> input_picture.data[plane_index];
2922     DWTELEM *dst= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2923     const int b_stride = s->b_width << s->block_max_depth;
2924     const int w= p->width;
2925     const int h= p->height;
2926     int index= mb_x + mb_y*b_stride;
2927     BlockNode *b= &s->block[index];
2928     BlockNode backup= *b;
2929     int ab=0;
2930     int aa=0;
2931
2932     b->type|= BLOCK_INTRA;
2933     b->color[plane_index]= 0;
2934     memset(dst, 0, obmc_stride*obmc_stride*sizeof(DWTELEM));
2935
2936     for(i=0; i<4; i++){
2937         int mb_x2= mb_x + (i &1) - 1;
2938         int mb_y2= mb_y + (i>>1) - 1;
2939         int x= block_w*mb_x2 + block_w/2;
2940         int y= block_w*mb_y2 + block_w/2;
2941
2942         add_yblock(s, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, ref, obmc,
2943                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2944
2945         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2946             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2947                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2948                 int obmc_v= obmc[index];
2949                 int d;
2950                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2951                 if(x<0) obmc_v += obmc[index + block_w];
2952                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2953                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2954                 //FIXME precalc this or simplify it somehow else
2955
2956                 d = -dst[index] + (1<<(FRAC_BITS-1));
2957                 dst[index] = d;
2958                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2959                 aa += obmc_v * obmc_v; //FIXME precalclate this
2960             }
2961         }
2962     }
2963     *b= backup;
2964
2965     return clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we shouldnt need cliping
2966 }
2967
2968 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2969     const int b_stride = s->b_width << s->block_max_depth;
2970     const int b_height = s->b_height<< s->block_max_depth;
2971     int index= x + y*b_stride;
2972     BlockNode *b     = &s->block[index];
2973     BlockNode *left  = x ? &s->block[index-1] : &null_block;
2974     BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2975     BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2976     BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2977     int dmx, dmy;
2978 //  int mx_context= av_log2(2*ABS(left->mx - top->mx));
2979 //  int my_context= av_log2(2*ABS(left->my - top->my));
2980
2981     if(x<0 || x>=b_stride || y>=b_height)
2982         return 0;
2983     dmx= b->mx - mid_pred(left->mx, top->mx, tr->mx);
2984     dmy= b->my - mid_pred(left->my, top->my, tr->my);
2985 /*
2986 1            0      0
2987 01X          1-2    1
2988 001XX        3-6    2-3
2989 0001XXX      7-14   4-7
2990 00001XXXX   15-30   8-15
2991 */
2992 //FIXME try accurate rate
2993 //FIXME intra and inter predictors if surrounding blocks arent the same type
2994     if(b->type & BLOCK_INTRA){
2995         return 3+2*( av_log2(2*ABS(left->color[0] - b->color[0]))
2996                    + av_log2(2*ABS(left->color[1] - b->color[1]))
2997                    + av_log2(2*ABS(left->color[2] - b->color[2])));
2998     }else
2999         return 2*(1 + av_log2(2*ABS(dmx))
3000                     + av_log2(2*ABS(dmy))); //FIXME kill the 2* can be merged in lambda
3001 }
3002
3003 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
3004     Plane *p= &s->plane[plane_index];
3005     const int block_size = MB_SIZE >> s->block_max_depth;
3006     const int block_w    = plane_index ? block_size/2 : block_size;
3007     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3008     const int obmc_stride= plane_index ? block_size : 2*block_size;
3009     const int ref_stride= s->current_picture.linesize[plane_index];
3010     uint8_t *ref= s->   last_picture.data[plane_index];
3011     uint8_t *dst= s->current_picture.data[plane_index];
3012     uint8_t *src= s->  input_picture.data[plane_index];
3013     DWTELEM *pred= (DWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
3014     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
3015     uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
3016     const int b_stride = s->b_width << s->block_max_depth;
3017     const int b_height = s->b_height<< s->block_max_depth;
3018     const int w= p->width;
3019     const int h= p->height;
3020     int distortion;
3021     int rate= 0;
3022     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3023     int sx= block_w*mb_x - block_w/2;
3024     int sy= block_w*mb_y - block_w/2;
3025     int x0= FFMAX(0,-sx);
3026     int y0= FFMAX(0,-sy);
3027     int x1= FFMIN(block_w*2, w-sx);
3028     int y1= FFMIN(block_w*2, h-sy);
3029     int i,x,y;
3030
3031     pred_block(s, cur, ref, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
3032
3033     for(y=y0; y<y1; y++){
3034         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
3035         const DWTELEM *pred1 = pred + y*obmc_stride;
3036         uint8_t *cur1 = cur + y*ref_stride;
3037         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
3038         for(x=x0; x<x1; x++){
3039             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
3040             v = (v + pred1[x]) >> FRAC_BITS;
3041             if(v&(~255)) v= ~(v>>31);
3042             dst1[x] = v;
3043         }
3044     }
3045
3046     /* copy the regions where obmc[] = (uint8_t)256 */
3047     if(LOG2_OBMC_MAX == 8
3048         && (mb_x == 0 || mb_x == b_stride-1)
3049         && (mb_y == 0 || mb_y == b_height-1)){
3050         if(mb_x == 0)
3051             x1 = block_w;
3052         else
3053             x0 = block_w;
3054         if(mb_y == 0)
3055             y1 = block_w;
3056         else
3057             y0 = block_w;
3058         for(y=y0; y<y1; y++)
3059             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
3060     }
3061
3062     //FIXME sad/ssd can be broken up, but wavelet cmp should be one 32x32 block
3063     if(block_w==16){
3064         distortion = 0;
3065         for(i=0; i<4; i++){
3066             int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
3067             distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
3068         }
3069     }else{
3070         assert(block_w==8);
3071         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
3072     }
3073
3074     if(plane_index==0){
3075         for(i=0; i<4; i++){
3076 /* ..RRr
3077  * .RXx.
3078  * rxx..
3079  */
3080             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
3081         }
3082         if(mb_x == b_stride-2)
3083             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
3084     }
3085     return distortion + rate*penalty_factor;
3086 }
3087
3088 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
3089     int i, y2;
3090     Plane *p= &s->plane[plane_index];
3091     const int block_size = MB_SIZE >> s->block_max_depth;
3092     const int block_w    = plane_index ? block_size/2 : block_size;
3093     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
3094     const int obmc_stride= plane_index ? block_size : 2*block_size;
3095     const int ref_stride= s->current_picture.linesize[plane_index];
3096     uint8_t *ref= s->   last_picture.data[plane_index];
3097     uint8_t *dst= s->current_picture.data[plane_index];
3098     uint8_t *src= s-> input_picture.data[plane_index];
3099     const static DWTELEM zero_dst[4096]; //FIXME
3100     const int b_stride = s->b_width << s->block_max_depth;
3101     const int b_height = s->b_height<< s->block_max_depth;
3102     const int w= p->width;
3103     const int h= p->height;
3104     int distortion= 0;
3105     int rate= 0;
3106     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
3107
3108     for(i=0; i<9; i++){
3109         int mb_x2= mb_x + (i%3) - 1;
3110         int mb_y2= mb_y + (i/3) - 1;
3111         int x= block_w*mb_x2 + block_w/2;
3112         int y= block_w*mb_y2 + block_w/2;
3113
3114         add_yblock(s, zero_dst, dst, ref, obmc,
3115                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
3116
3117         //FIXME find a cleaner/simpler way to skip the outside stuff
3118         for(y2= y; y2<0; y2++)
3119             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3120         for(y2= h; y2<y+block_w; y2++)
3121             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
3122         if(x<0){
3123             for(y2= y; y2<y+block_w; y2++)
3124                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
3125         }
3126         if(x+block_w > w){
3127             for(y2= y; y2<y+block_w; y2++)
3128                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
3129         }
3130
3131         assert(block_w== 8 || block_w==16);
3132         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
3133     }
3134
3135     if(plane_index==0){
3136         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
3137         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
3138
3139 /* ..RRRr
3140  * .RXXx.
3141  * .RXXx.
3142  * rxxx.
3143  */
3144         if(merged)
3145             rate = get_block_bits(s, mb_x, mb_y, 2);
3146         for(i=merged?4:0; i<9; i++){
3147             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3148             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3149         }
3150     }
3151     return distortion + rate*penalty_factor;
3152 }
3153
3154 static always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3155     const int b_stride= s->b_width << s->block_max_depth;
3156     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3157     BlockNode backup= *block;
3158     int rd, index, value;
3159
3160     assert(mb_x>=0 && mb_y>=0);
3161     assert(mb_x<b_stride);
3162
3163     if(intra){
3164         block->color[0] = p[0];
3165         block->color[1] = p[1];
3166         block->color[2] = p[2];
3167         block->type |= BLOCK_INTRA;
3168     }else{
3169         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3170         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6);
3171         if(s->me_cache[index] == value)
3172             return 0;
3173         s->me_cache[index]= value;
3174
3175         block->mx= p[0];
3176         block->my= p[1];
3177         block->type &= ~BLOCK_INTRA;
3178     }
3179
3180     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3181
3182 //FIXME chroma
3183     if(rd < *best_rd){
3184         *best_rd= rd;
3185         return 1;
3186     }else{
3187         *block= backup;
3188         return 0;
3189     }
3190 }
3191
3192 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3193 static always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3194     int p[2] = {p0, p1};
3195     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3196 }
3197
3198 static always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int *best_rd){
3199     const int b_stride= s->b_width << s->block_max_depth;
3200     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3201     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3202     int rd, index, value;
3203
3204     assert(mb_x>=0 && mb_y>=0);
3205     assert(mb_x<b_stride);
3206     assert(((mb_x|mb_y)&1) == 0);
3207
3208     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3209     value= s->me_cache_generation + (p0>>10) + (p1<<6);
3210     if(s->me_cache[index] == value)
3211         return 0;
3212     s->me_cache[index]= value;
3213
3214     block->mx= p0;
3215     block->my= p1;
3216     block->type &= ~BLOCK_INTRA;
3217     block[1]= block[b_stride]= block[b_stride+1]= *block;
3218
3219     rd= get_4block_rd(s, mb_x, mb_y, 0);
3220
3221 //FIXME chroma
3222     if(rd < *best_rd){
3223         *best_rd= rd;
3224         return 1;
3225     }else{
3226         block[0]= backup[0];
3227         block[1]= backup[1];
3228         block[b_stride]= backup[2];
3229         block[b_stride+1]= backup[3];
3230         return 0;
3231     }
3232 }
3233
3234 static void iterative_me(SnowContext *s){
3235     int pass, mb_x, mb_y;
3236     const int b_width = s->b_width  << s->block_max_depth;
3237     const int b_height= s->b_height << s->block_max_depth;
3238     const int b_stride= b_width;
3239     int color[3];
3240
3241     {
3242         RangeCoder r = s->c;
3243         uint8_t state[sizeof(s->block_state)];
3244         memcpy(state, s->block_state, sizeof(s->block_state));
3245         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3246             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3247                 encode_q_branch(s, 0, mb_x, mb_y);
3248         s->c = r;
3249         memcpy(s->block_state, state, sizeof(s->block_state));
3250     }
3251
3252     for(pass=0; pass<50; pass++){
3253         int change= 0;
3254
3255         for(mb_y= 0; mb_y<b_height; mb_y++){
3256             for(mb_x= 0; mb_x<b_width; mb_x++){
3257                 int dia_change, i, j;
3258                 int best_rd= INT_MAX;
3259                 BlockNode backup;
3260                 const int index= mb_x + mb_y * b_stride;
3261                 BlockNode *block= &s->block[index];
3262                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : &null_block;
3263                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : &null_block;
3264                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : &null_block;
3265                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : &null_block;
3266                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : &null_block;
3267                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : &null_block;
3268                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : &null_block;
3269                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : &null_block;
3270                 const int b_w= (MB_SIZE >> s->block_max_depth);
3271                 uint8_t obmc_edged[b_w*2][b_w*2];
3272
3273                 if(pass && (block->type & BLOCK_OPT))
3274                     continue;
3275                 block->type |= BLOCK_OPT;
3276
3277                 backup= *block;
3278
3279                 if(!s->me_cache_generation)
3280                     memset(s->me_cache, 0, sizeof(s->me_cache));
3281                 s->me_cache_generation += 1<<22;
3282
3283                 //FIXME precalc
3284                 {
3285                     int x, y;
3286                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3287                     if(mb_x==0)
3288                         for(y=0; y<b_w*2; y++)
3289                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3290                     if(mb_x==b_stride-1)
3291                         for(y=0; y<b_w*2; y++)
3292                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3293                     if(mb_y==0){
3294                         for(x=0; x<b_w*2; x++)
3295                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3296                         for(y=1; y<b_w; y++)
3297                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3298                     }
3299                     if(mb_y==b_height-1){
3300                         for(x=0; x<b_w*2; x++)
3301                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3302                         for(y=b_w; y<b_w*2-1; y++)
3303                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3304                     }
3305                 }
3306
3307                 //skip stuff outside the picture
3308                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3309                 {
3310                     uint8_t *src= s->  input_picture.data[0];
3311                     uint8_t *dst= s->current_picture.data[0];
3312                     const int stride= s->current_picture.linesize[0];
3313                     const int block_w= MB_SIZE >> s->block_max_depth;
3314                     const int sx= block_w*mb_x - block_w/2;
3315                     const int sy= block_w*mb_y - block_w/2;
3316                     const int w= s->plane[0].width;
3317                     const int h= s->plane[0].height;
3318                     int y;
3319
3320                     for(y=sy; y<0; y++)
3321                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3322                     for(y=h; y<sy+block_w*2; y++)
3323                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3324                     if(sx<0){
3325                         for(y=sy; y<sy+block_w*2; y++)
3326                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3327                     }
3328                     if(sx+block_w*2 > w){
3329                         for(y=sy; y<sy+block_w*2; y++)
3330                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3331                     }
3332                 }
3333
3334                 // intra(black) = neighbors' contribution to the current block
3335                 for(i=0; i<3; i++)
3336                     color[i]= get_dc(s, mb_x, mb_y, i);
3337
3338                 // get previous score (cant be cached due to OBMC)
3339                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3340                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3341                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3342                 }else
3343                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3344
3345                 check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3346                 check_block_inter(s, mb_x, mb_y, tb->mx, tb->my, *obmc_edged, &best_rd);
3347                 check_block_inter(s, mb_x, mb_y, lb->mx, lb->my, *obmc_edged, &best_rd);
3348                 check_block_inter(s, mb_x, mb_y, rb->mx, rb->my, *obmc_edged, &best_rd);
3349                 check_block_inter(s, mb_x, mb_y, bb->mx, bb->my, *obmc_edged, &best_rd);
3350
3351                 /* fullpel ME */
3352                 //FIXME avoid subpel interpol / round to nearest integer
3353                 do{
3354                     dia_change=0;
3355                     for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3356                         for(j=0; j<i; j++){
3357                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3358                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3359                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3360                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3361                         }
3362                     }
3363                 }while(dia_change);
3364                 /* subpel ME */
3365                 do{
3366                     static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3367                     dia_change=0;
3368                     for(i=0; i<8; i++)
3369                         dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3370                 }while(dia_change);
3371                 //FIXME or try the standard 2 pass qpel or similar
3372 #if 1
3373                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3374                 //FIXME RD style color selection
3375 #endif
3376                 if(!same_block(block, &backup)){
3377                     if(tb != &null_block) tb ->type &= ~BLOCK_OPT;
3378                     if(lb != &null_block) lb ->type &= ~BLOCK_OPT;
3379                     if(rb != &null_block) rb ->type &= ~BLOCK_OPT;
3380                     if(bb != &null_block) bb ->type &= ~BLOCK_OPT;
3381                     if(tlb!= &null_block) tlb->type &= ~BLOCK_OPT;
3382                     if(trb!= &null_block) trb->type &= ~BLOCK_OPT;
3383                     if(blb!= &null_block) blb->type &= ~BLOCK_OPT;
3384                     if(brb!= &null_block) brb->type &= ~BLOCK_OPT;
3385                     change ++;
3386                 }
3387             }
3388         }
3389         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3390         if(!change)
3391             break;
3392     }
3393
3394     if(s->block_max_depth == 1){
3395         int change= 0;
3396         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3397             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3398                 int dia_change, i, j;
3399                 int best_rd, init_rd;
3400                 const int index= mb_x + mb_y * b_stride;
3401                 BlockNode *b[4];
3402
3403                 b[0]= &s->block[index];
3404                 b[1]= b[0]+1;
3405                 b[2]= b[0]+b_stride;
3406                 b[3]= b[2]+1;
3407                 if(same_block(b[0], b[1]) &&
3408                    same_block(b[0], b[2]) &&
3409                    same_block(b[0], b[3]))
3410                     continue;
3411
3412                 if(!s->me_cache_generation)
3413                     memset(s->me_cache, 0, sizeof(s->me_cache));
3414                 s->me_cache_generation += 1<<22;
3415
3416                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3417
3418                 check_4block_inter(s, mb_x, mb_y,
3419                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3420                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, &best_rd);
3421
3422                 for(i=0; i<4; i++)
3423                     if(!(b[i]->type&BLOCK_INTRA))
3424                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, &best_rd);
3425
3426                 if(init_rd != best_rd)
3427                     change++;
3428             }
3429         }
3430         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3431     }
3432 }
3433
3434 static void quantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int bias){
3435     const int level= b->level;
3436     const int w= b->width;
3437     const int h= b->height;
3438     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3439     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3440     int x,y, thres1, thres2;
3441 //    START_TIMER
3442
3443     if(s->qlog == LOSSLESS_QLOG) return;
3444
3445     bias= bias ? 0 : (3*qmul)>>3;
3446     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3447     thres2= 2*thres1;
3448
3449     if(!bias){
3450         for(y=0; y<h; y++){
3451             for(x=0; x<w; x++){
3452                 int i= src[x + y*stride];
3453
3454                 if((unsigned)(i+thres1) > thres2){
3455                     if(i>=0){
3456                         i<<= QEXPSHIFT;
3457                         i/= qmul; //FIXME optimize
3458                         src[x + y*stride]=  i;
3459                     }else{
3460                         i= -i;
3461                         i<<= QEXPSHIFT;
3462                         i/= qmul; //FIXME optimize
3463                         src[x + y*stride]= -i;
3464                     }
3465                 }else
3466                     src[x + y*stride]= 0;
3467             }
3468         }
3469     }else{
3470         for(y=0; y<h; y++){
3471             for(x=0; x<w; x++){
3472                 int i= src[x + y*stride];
3473
3474                 if((unsigned)(i+thres1) > thres2){
3475                     if(i>=0){
3476                         i<<= QEXPSHIFT;
3477                         i= (i + bias) / qmul; //FIXME optimize
3478                         src[x + y*stride]=  i;
3479                     }else{
3480                         i= -i;
3481                         i<<= QEXPSHIFT;
3482                         i= (i + bias) / qmul; //FIXME optimize
3483                         src[x + y*stride]= -i;
3484                     }
3485                 }else
3486                     src[x + y*stride]= 0;
3487             }
3488         }
3489     }
3490     if(level+1 == s->spatial_decomposition_count){
3491 //        STOP_TIMER("quantize")
3492     }
3493 }
3494
3495 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int start_y, int end_y){
3496     const int w= b->width;
3497     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3498     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3499     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3500     int x,y;
3501     START_TIMER
3502
3503     if(s->qlog == LOSSLESS_QLOG) return;
3504
3505     for(y=start_y; y<end_y; y++){
3506 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3507         DWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3508         for(x=0; x<w; x++){
3509             int i= line[x];
3510             if(i<0){
3511                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3512             }else if(i>0){
3513                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3514             }
3515         }
3516     }
3517     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3518         STOP_TIMER("dquant")
3519     }
3520 }
3521
3522 static void dequantize(SnowContext *s, SubBand *b, DWTELEM *src, int stride){
3523     const int w= b->width;
3524     const int h= b->height;
3525     const int qlog= clip(s->qlog + b->qlog, 0, QROOT*16);
3526     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3527     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3528     int x,y;
3529     START_TIMER
3530
3531     if(s->qlog == LOSSLESS_QLOG) return;
3532
3533     for(y=0; y<h; y++){
3534         for(x=0; x<w; x++){
3535             int i= src[x + y*stride];
3536             if(i<0){
3537                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3538             }else if(i>0){
3539                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3540             }
3541         }
3542     }
3543     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3544         STOP_TIMER("dquant")
3545     }
3546 }
3547
3548 static void decorrelate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3549     const int w= b->width;
3550     const int h= b->height;
3551     int x,y;
3552
3553     for(y=h-1; y>=0; y--){
3554         for(x=w-1; x>=0; x--){
3555             int i= x + y*stride;
3556
3557             if(x){
3558                 if(use_median){
3559                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3560                     else  src[i] -= src[i - 1];
3561                 }else{
3562                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3563                     else  src[i] -= src[i - 1];
3564                 }
3565             }else{
3566                 if(y) src[i] -= src[i - stride];
3567             }
3568         }
3569     }
3570 }
3571
3572 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3573     const int w= b->width;
3574     int x,y;
3575
3576 //    START_TIMER
3577
3578     DWTELEM * line;
3579     DWTELEM * prev;
3580
3581     if (start_y != 0)
3582         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3583
3584     for(y=start_y; y<end_y; y++){
3585         prev = line;
3586 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3587         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3588         for(x=0; x<w; x++){
3589             if(x){
3590                 if(use_median){
3591                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3592                     else  line[x] += line[x - 1];
3593                 }else{
3594                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3595                     else  line[x] += line[x - 1];
3596                 }
3597             }else{
3598                 if(y) line[x] += prev[x];
3599             }
3600         }
3601     }
3602
3603 //    STOP_TIMER("correlate")
3604 }
3605
3606 static void correlate(SnowContext *s, SubBand *b, DWTELEM *src, int stride, int inverse, int use_median){
3607     const int w= b->width;
3608     const int h= b->height;
3609     int x,y;
3610
3611     for(y=0; y<h; y++){
3612         for(x=0; x<w; x++){
3613             int i= x + y*stride;
3614
3615             if(x){
3616                 if(use_median){
3617                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3618                     else  src[i] += src[i - 1];
3619                 }else{
3620                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3621                     else  src[i] += src[i - 1];
3622                 }
3623             }else{
3624                 if(y) src[i] += src[i - stride];
3625             }
3626         }
3627     }
3628 }
3629
3630 static void encode_header(SnowContext *s){
3631     int plane_index, level, orientation;
3632     uint8_t kstate[32];
3633
3634     memset(kstate, MID_STATE, sizeof(kstate));
3635
3636     put_rac(&s->c, kstate, s->keyframe);
3637     if(s->keyframe || s->always_reset)
3638         reset_contexts(s);
3639     if(s->keyframe){
3640         put_symbol(&s->c, s->header_state, s->version, 0);
3641         put_rac(&s->c, s->header_state, s->always_reset);
3642         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3643         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3644         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3645         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3646         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3647         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3648         put_rac(&s->c, s->header_state, s->spatial_scalability);
3649 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3650
3651         for(plane_index=0; plane_index<2; plane_index++){
3652             for(level=0; level<s->spatial_decomposition_count; level++){
3653                 for(orientation=level ? 1:0; orientation<4; orientation++){
3654                     if(orientation==2) continue;
3655                     put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3656                 }
3657             }
3658         }
3659     }
3660     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type, 0);
3661     put_symbol(&s->c, s->header_state, s->qlog, 1);
3662     put_symbol(&s->c, s->header_state, s->mv_scale, 0);
3663     put_symbol(&s->c, s->header_state, s->qbias, 1);
3664     put_symbol(&s->c, s->header_state, s->block_max_depth, 0);
3665 }
3666
3667 static int decode_header(SnowContext *s){
3668     int plane_index, level, orientation;
3669     uint8_t kstate[32];
3670
3671     memset(kstate, MID_STATE, sizeof(kstate));
3672
3673     s->keyframe= get_rac(&s->c, kstate);
3674     if(s->keyframe || s->always_reset)
3675         reset_contexts(s);
3676     if(s->keyframe){
3677         s->version= get_symbol(&s->c, s->header_state, 0);
3678         if(s->version>0){
3679             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3680             return -1;
3681         }
3682         s->always_reset= get_rac(&s->c, s->header_state);
3683         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3684         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3685         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3686         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3687         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3688         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3689         s->spatial_scalability= get_rac(&s->c, s->header_state);
3690 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3691
3692         for(plane_index=0; plane_index<3; plane_index++){
3693             for(level=0; level<s->spatial_decomposition_count; level++){
3694                 for(orientation=level ? 1:0; orientation<4; orientation++){
3695                     int q;
3696                     if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3697                     else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3698                     else                    q= get_symbol(&s->c, s->header_state, 1);
3699                     s->plane[plane_index].band[level][orientation].qlog= q;
3700                 }
3701             }
3702         }
3703     }
3704
3705     s->spatial_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3706     if(s->spatial_decomposition_type > 2){
3707         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3708         return -1;
3709     }
3710
3711     s->qlog= get_symbol(&s->c, s->header_state, 1);
3712     s->mv_scale= get_symbol(&s->c, s->header_state, 0);
3713     s->qbias= get_symbol(&s->c, s->header_state, 1);
3714     s->block_max_depth= get_symbol(&s->c, s->header_state, 0);
3715     if(s->block_max_depth > 1){
3716         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3717         s->block_max_depth= 0;
3718         return -1;
3719     }
3720
3721     return 0;
3722 }
3723
3724 static void init_qexp(void){
3725     int i;
3726     double v=128;
3727
3728     for(i=0; i<QROOT; i++){
3729         qexp[i]= lrintf(v);
3730         v *= pow(2, 1.0 / QROOT);
3731     }
3732 }
3733
3734 static int common_init(AVCodecContext *avctx){
3735     SnowContext *s = avctx->priv_data;
3736     int width, height;
3737     int level, orientation, plane_index, dec;
3738
3739     s->avctx= avctx;
3740
3741     dsputil_init(&s->dsp, avctx);
3742
3743 #define mcf(dx,dy)\
3744     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3745     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3746         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3747     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3748     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3749         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3750
3751     mcf( 0, 0)
3752     mcf( 4, 0)
3753     mcf( 8, 0)
3754     mcf(12, 0)
3755     mcf( 0, 4)
3756     mcf( 4, 4)
3757     mcf( 8, 4)
3758     mcf(12, 4)
3759     mcf( 0, 8)
3760     mcf( 4, 8)
3761     mcf( 8, 8)
3762     mcf(12, 8)
3763     mcf( 0,12)
3764     mcf( 4,12)
3765     mcf( 8,12)
3766     mcf(12,12)
3767
3768 #define mcfh(dx,dy)\
3769     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3770     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3771         mc_block_hpel ## dx ## dy ## 16;\
3772     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3773     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3774         mc_block_hpel ## dx ## dy ## 8;
3775
3776     mcfh(0, 0)
3777     mcfh(8, 0)
3778     mcfh(0, 8)
3779     mcfh(8, 8)
3780
3781     if(!qexp[0])
3782         init_qexp();
3783
3784     dec= s->spatial_decomposition_count= 5;
3785     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3786
3787     s->chroma_h_shift= 1; //FIXME XXX
3788     s->chroma_v_shift= 1;
3789
3790 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3791
3792     width= s->avctx->width;
3793     height= s->avctx->height;
3794
3795     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM));
3796
3797     s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3798     s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3799
3800     for(plane_index=0; plane_index<3; plane_index++){
3801         int w= s->avctx->width;
3802         int h= s->avctx->height;
3803
3804         if(plane_index){
3805             w>>= s->chroma_h_shift;
3806             h>>= s->chroma_v_shift;
3807         }
3808         s->plane[plane_index].width = w;
3809         s->plane[plane_index].height= h;
3810 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3811         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3812             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3813                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3814
3815                 b->buf= s->spatial_dwt_buffer;
3816                 b->level= level;
3817                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3818                 b->width = (w + !(orientation&1))>>1;
3819                 b->height= (h + !(orientation>1))>>1;
3820
3821                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3822                 b->buf_x_offset = 0;
3823                 b->buf_y_offset = 0;
3824
3825                 if(orientation&1){
3826                     b->buf += (w+1)>>1;
3827                     b->buf_x_offset = (w+1)>>1;
3828                 }
3829                 if(orientation>1){
3830                     b->buf += b->stride>>1;
3831                     b->buf_y_offset = b->stride_line >> 1;
3832                 }
3833
3834                 if(level)
3835                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3836                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3837             }
3838             w= (w+1)>>1;
3839             h= (h+1)>>1;
3840         }
3841     }
3842
3843     reset_contexts(s);
3844 /*
3845     width= s->width= avctx->width;
3846     height= s->height= avctx->height;
3847
3848     assert(width && height);
3849 */
3850     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3851
3852     return 0;
3853 }
3854
3855
3856 static void calculate_vissual_weight(SnowContext *s, Plane *p){
3857     int width = p->width;
3858     int height= p->height;
3859     int level, orientation, x, y;
3860
3861     for(level=0; level<s->spatial_decomposition_count; level++){
3862         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3863             SubBand *b= &p->band[level][orientation];
3864             DWTELEM *buf= b->buf;
3865             int64_t error=0;
3866
3867             memset(s->spatial_dwt_buffer, 0, sizeof(int)*width*height);
3868             buf[b->width/2 + b->height/2*b->stride]= 256*256;
3869             ff_spatial_idwt(s->spatial_dwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3870             for(y=0; y<height; y++){
3871                 for(x=0; x<width; x++){
3872                     int64_t d= s->spatial_dwt_buffer[x + y*width];
3873                     error += d*d;
3874                 }
3875             }
3876
3877             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3878 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3879         }
3880     }
3881 }
3882
3883 static int encode_init(AVCodecContext *avctx)
3884 {
3885     SnowContext *s = avctx->priv_data;
3886     int plane_index;
3887
3888     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3889         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3890                "use vstrict=-2 / -strict -2 to use it anyway\n");
3891         return -1;
3892     }
3893
3894     common_init(avctx);
3895     alloc_blocks(s);
3896
3897     s->version=0;
3898
3899     s->m.avctx   = avctx;
3900     s->m.flags   = avctx->flags;
3901     s->m.bit_rate= avctx->bit_rate;
3902
3903     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3904     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3905     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3906     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3907     h263_encode_init(&s->m); //mv_penalty
3908
3909     if(avctx->flags&CODEC_FLAG_PASS1){
3910         if(!avctx->stats_out)
3911             avctx->stats_out = av_mallocz(256);
3912     }
3913     if(avctx->flags&CODEC_FLAG_PASS2){
3914         if(ff_rate_control_init(&s->m) < 0)
3915             return -1;
3916     }
3917
3918     for(plane_index=0; plane_index<3; plane_index++){
3919         calculate_vissual_weight(s, &s->plane[plane_index]);
3920     }
3921
3922
3923     avctx->coded_frame= &s->current_picture;
3924     switch(avctx->pix_fmt){
3925 //    case PIX_FMT_YUV444P:
3926 //    case PIX_FMT_YUV422P:
3927     case PIX_FMT_YUV420P:
3928     case PIX_FMT_GRAY8:
3929 //    case PIX_FMT_YUV411P:
3930 //    case PIX_FMT_YUV410P:
3931         s->colorspace_type= 0;
3932         break;
3933 /*    case PIX_FMT_RGBA32:
3934         s->colorspace= 1;
3935         break;*/
3936     default:
3937         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3938         return -1;
3939     }
3940 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3941     s->chroma_h_shift= 1;
3942     s->chroma_v_shift= 1;
3943
3944     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
3945     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
3946
3947     s->avctx->get_buffer(s->avctx, &s->input_picture);
3948
3949     return 0;
3950 }
3951
3952 static int frame_start(SnowContext *s){
3953    AVFrame tmp;
3954    int w= s->avctx->width; //FIXME round up to x16 ?
3955    int h= s->avctx->height;
3956
3957     if(s->current_picture.data[0]){
3958         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
3959         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
3960         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
3961     }
3962
3963     tmp= s->last_picture;
3964     s->last_picture= s->current_picture;
3965     s->current_picture= tmp;
3966
3967     s->current_picture.reference= 1;
3968     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
3969         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
3970         return -1;
3971     }
3972
3973     return 0;
3974 }
3975
3976 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
3977     SnowContext *s = avctx->priv_data;
3978     RangeCoder * const c= &s->c;
3979     AVFrame *pict = data;
3980     const int width= s->avctx->width;
3981     const int height= s->avctx->height;
3982     int level, orientation, plane_index, i, y;
3983
3984     ff_init_range_encoder(c, buf, buf_size);
3985     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
3986
3987     for(i=0; i<3; i++){
3988         int shift= !!i;
3989         for(y=0; y<(height>>shift); y++)
3990             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
3991                    &pict->data[i][y * pict->linesize[i]],
3992                    width>>shift);
3993     }
3994     s->new_picture = *pict;
3995
3996     if(avctx->flags&CODEC_FLAG_PASS2){
3997         s->m.pict_type =
3998         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
3999         s->keyframe= pict->pict_type==FF_I_TYPE;
4000         s->m.picture_number= avctx->frame_number;
4001         if(!(avctx->flags&CODEC_FLAG_QSCALE))
4002             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4003     }else{
4004         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4005         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4006     }
4007
4008     if(pict->quality){
4009         s->qlog= rint(QROOT*log(pict->quality / (float)FF_QP2LAMBDA)/log(2));
4010         //<64 >60
4011         s->qlog += 61*QROOT/8;
4012     }else{
4013         s->qlog= LOSSLESS_QLOG;
4014     }
4015
4016     frame_start(s);
4017     s->current_picture.key_frame= s->keyframe;
4018
4019     s->m.current_picture_ptr= &s->m.current_picture;
4020     if(pict->pict_type == P_TYPE){
4021         int block_width = (width +15)>>4;
4022         int block_height= (height+15)>>4;
4023         int stride= s->current_picture.linesize[0];
4024
4025         assert(s->current_picture.data[0]);
4026         assert(s->last_picture.data[0]);
4027
4028         s->m.avctx= s->avctx;
4029         s->m.current_picture.data[0]= s->current_picture.data[0];
4030         s->m.   last_picture.data[0]= s->   last_picture.data[0];
4031         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4032         s->m.   last_picture_ptr= &s->m.   last_picture;
4033         s->m.linesize=
4034         s->m.   last_picture.linesize[0]=
4035         s->m.    new_picture.linesize[0]=
4036         s->m.current_picture.linesize[0]= stride;
4037         s->m.uvlinesize= s->current_picture.linesize[1];
4038         s->m.width = width;
4039         s->m.height= height;
4040         s->m.mb_width = block_width;
4041         s->m.mb_height= block_height;
4042         s->m.mb_stride=   s->m.mb_width+1;
4043         s->m.b8_stride= 2*s->m.mb_width+1;
4044         s->m.f_code=1;
4045         s->m.pict_type= pict->pict_type;
4046         s->m.me_method= s->avctx->me_method;
4047         s->m.me.scene_change_score=0;
4048         s->m.flags= s->avctx->flags;
4049         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4050         s->m.out_format= FMT_H263;
4051         s->m.unrestricted_mv= 1;
4052
4053         s->lambda = s->m.lambda= pict->quality * 3/2; //FIXME bug somewhere else
4054         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4055         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4056
4057         s->m.dsp= s->dsp; //move
4058         ff_init_me(&s->m);
4059         s->dsp= s->m.dsp;
4060     }
4061
4062 redo_frame:
4063
4064     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
4065
4066     encode_header(s);
4067     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4068     encode_blocks(s);
4069     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4070
4071     for(plane_index=0; plane_index<3; plane_index++){
4072         Plane *p= &s->plane[plane_index];
4073         int w= p->width;
4074         int h= p->height;
4075         int x, y;
4076 //        int bits= put_bits_count(&s->c.pb);
4077
4078         //FIXME optimize
4079      if(pict->data[plane_index]) //FIXME gray hack
4080         for(y=0; y<h; y++){
4081             for(x=0; x<w; x++){
4082                 s->spatial_dwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4083             }
4084         }
4085         predict_plane(s, s->spatial_dwt_buffer, plane_index, 0);
4086
4087         if(   plane_index==0
4088            && pict->pict_type == P_TYPE
4089            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4090             ff_init_range_encoder(c, buf, buf_size);
4091             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4092             pict->pict_type= FF_I_TYPE;
4093             s->keyframe=1;
4094             reset_contexts(s);
4095             goto redo_frame;
4096         }
4097
4098         if(s->qlog == LOSSLESS_QLOG){
4099             for(y=0; y<h; y++){
4100                 for(x=0; x<w; x++){
4101                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_dwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4102                 }
4103             }
4104         }
4105
4106         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4107
4108         for(level=0; level<s->spatial_decomposition_count; level++){
4109             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4110                 SubBand *b= &p->band[level][orientation];
4111
4112                 quantize(s, b, b->buf, b->stride, s->qbias);
4113                 if(orientation==0)
4114                     decorrelate(s, b, b->buf, b->stride, pict->pict_type == P_TYPE, 0);
4115                 encode_subband(s, b, b->buf, b->parent ? b->parent->buf : NULL, b->stride, orientation);
4116                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
4117                 if(orientation==0)
4118                     correlate(s, b, b->buf, b->stride, 1, 0);
4119             }
4120         }
4121 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
4122
4123         for(level=0; level<s->spatial_decomposition_count; level++){
4124             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4125                 SubBand *b= &p->band[level][orientation];
4126
4127                 dequantize(s, b, b->buf, b->stride);
4128             }
4129         }
4130
4131         ff_spatial_idwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4132         if(s->qlog == LOSSLESS_QLOG){
4133             for(y=0; y<h; y++){
4134                 for(x=0; x<w; x++){
4135                     s->spatial_dwt_buffer[y*w + x]<<=FRAC_BITS;
4136                 }
4137             }
4138         }
4139 {START_TIMER
4140         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4141 STOP_TIMER("pred-conv")}
4142         if(s->avctx->flags&CODEC_FLAG_PSNR){
4143             int64_t error= 0;
4144
4145     if(pict->data[plane_index]) //FIXME gray hack
4146             for(y=0; y<h; y++){
4147                 for(x=0; x<w; x++){
4148                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4149                     error += d*d;
4150                 }
4151             }
4152             s->avctx->error[plane_index] += error;
4153             s->current_picture.error[plane_index] = error;
4154         }
4155     }
4156
4157     if(s->last_picture.data[0])
4158         avctx->release_buffer(avctx, &s->last_picture);
4159
4160     s->current_picture.coded_picture_number = avctx->frame_number;
4161     s->current_picture.pict_type = pict->pict_type;
4162     s->current_picture.quality = pict->quality;
4163     if(avctx->flags&CODEC_FLAG_PASS1){
4164         s->m.p_tex_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits - s->m.mv_bits;
4165         s->m.current_picture.display_picture_number =
4166         s->m.current_picture.coded_picture_number = avctx->frame_number;
4167         s->m.pict_type = pict->pict_type;
4168         s->m.current_picture.quality = pict->quality;
4169         ff_write_pass1_stats(&s->m);
4170     }
4171     if(avctx->flags&CODEC_FLAG_PASS2){
4172         s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4173     }
4174
4175     emms_c();
4176
4177     return ff_rac_terminate(c);
4178 }
4179
4180 static void common_end(SnowContext *s){
4181     int plane_index, level, orientation;
4182
4183     av_freep(&s->spatial_dwt_buffer);
4184
4185     av_freep(&s->m.me.scratchpad);
4186     av_freep(&s->m.me.map);
4187     av_freep(&s->m.me.score_map);
4188     av_freep(&s->m.obmc_scratchpad);
4189
4190     av_freep(&s->block);
4191
4192     for(plane_index=0; plane_index<3; plane_index++){
4193         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4194             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4195                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4196
4197                 av_freep(&b->x_coeff);
4198             }
4199         }
4200     }
4201 }
4202
4203 static int encode_end(AVCodecContext *avctx)
4204 {
4205     SnowContext *s = avctx->priv_data;
4206
4207     common_end(s);
4208     av_free(avctx->stats_out);
4209
4210     return 0;
4211 }
4212
4213 static int decode_init(AVCodecContext *avctx)
4214 {
4215     SnowContext *s = avctx->priv_data;
4216     int block_size;
4217
4218     avctx->pix_fmt= PIX_FMT_YUV420P;
4219
4220     common_init(avctx);
4221
4222     block_size = MB_SIZE >> s->block_max_depth;
4223     slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 3)) + 1, s->plane[0].width, s->spatial_dwt_buffer);
4224
4225     return 0;
4226 }
4227
4228 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4229     SnowContext *s = avctx->priv_data;
4230     RangeCoder * const c= &s->c;
4231     int bytes_read;
4232     AVFrame *picture = data;
4233     int level, orientation, plane_index;
4234
4235     ff_init_range_decoder(c, buf, buf_size);
4236     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4237
4238     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4239     decode_header(s);
4240     if(!s->block) alloc_blocks(s);
4241
4242     frame_start(s);
4243     //keyframe flag dupliaction mess FIXME
4244     if(avctx->debug&FF_DEBUG_PICT_INFO)
4245         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4246
4247     decode_blocks(s);
4248
4249     for(plane_index=0; plane_index<3; plane_index++){
4250         Plane *p= &s->plane[plane_index];
4251         int w= p->width;
4252         int h= p->height;
4253         int x, y;
4254         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4255
4256 if(s->avctx->debug&2048){
4257         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4258         predict_plane(s, s->spatial_dwt_buffer, plane_index, 1);
4259
4260         for(y=0; y<h; y++){
4261             for(x=0; x<w; x++){
4262                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4263                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4264             }
4265         }
4266 }
4267
4268 {   START_TIMER
4269     for(level=0; level<s->spatial_decomposition_count; level++){
4270         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4271             SubBand *b= &p->band[level][orientation];
4272             unpack_coeffs(s, b, b->parent, orientation);
4273         }
4274     }
4275     STOP_TIMER("unpack coeffs");
4276 }
4277
4278 {START_TIMER
4279     const int mb_h= s->b_height << s->block_max_depth;
4280     const int block_size = MB_SIZE >> s->block_max_depth;
4281     const int block_w    = plane_index ? block_size/2 : block_size;
4282     int mb_y;
4283     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4284     int yd=0, yq=0;
4285     int y;
4286     int end_y;
4287
4288     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4289     for(mb_y=0; mb_y<=mb_h; mb_y++){
4290
4291         int slice_starty = block_w*mb_y;
4292         int slice_h = block_w*(mb_y+1);
4293         if (!(s->keyframe || s->avctx->debug&512)){
4294             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4295             slice_h -= (block_w >> 1);
4296         }
4297
4298         {
4299         START_TIMER
4300         for(level=0; level<s->spatial_decomposition_count; level++){
4301             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4302                 SubBand *b= &p->band[level][orientation];
4303                 int start_y;
4304                 int end_y;
4305                 int our_mb_start = mb_y;
4306                 int our_mb_end = (mb_y + 1);
4307                 const int extra= 3;
4308                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4309                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4310                 if (!(s->keyframe || s->avctx->debug&512)){
4311                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4312                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4313                 }
4314                 start_y = FFMIN(b->height, start_y);
4315                 end_y = FFMIN(b->height, end_y);
4316
4317                 if (start_y != end_y){
4318                     if (orientation == 0){
4319                         SubBand * correlate_band = &p->band[0][0];
4320                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4321                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4322                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4323                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4324                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->buf, correlate_band->stride, start_y, end_y);
4325                     }
4326                     else
4327                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4328                 }
4329             }
4330         }
4331         STOP_TIMER("decode_subband_slice");
4332         }
4333
4334 {   START_TIMER
4335         for(; yd<slice_h; yd+=4){
4336             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4337         }
4338     STOP_TIMER("idwt slice");}
4339
4340
4341         if(s->qlog == LOSSLESS_QLOG){
4342             for(; yq<slice_h && yq<h; yq++){
4343                 DWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4344                 for(x=0; x<w; x++){
4345                     line[x] <<= FRAC_BITS;
4346                 }
4347             }
4348         }
4349
4350         predict_slice_buffered(s, &s->sb, s->spatial_dwt_buffer, plane_index, 1, mb_y);
4351
4352         y = FFMIN(p->height, slice_starty);
4353         end_y = FFMIN(p->height, slice_h);
4354         while(y < end_y)
4355             slice_buffer_release(&s->sb, y++);
4356     }
4357
4358     slice_buffer_flush(&s->sb);
4359
4360 STOP_TIMER("idwt + predict_slices")}
4361     }
4362
4363     emms_c();
4364
4365     if(s->last_picture.data[0])
4366         avctx->release_buffer(avctx, &s->last_picture);
4367
4368 if(!(s->avctx->debug&2048))
4369     *picture= s->current_picture;
4370 else
4371     *picture= s->mconly_picture;
4372
4373     *data_size = sizeof(AVFrame);
4374
4375     bytes_read= c->bytestream - c->bytestream_start;
4376     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4377
4378     return bytes_read;
4379 }
4380
4381 static int decode_end(AVCodecContext *avctx)
4382 {
4383     SnowContext *s = avctx->priv_data;
4384
4385     slice_buffer_destroy(&s->sb);
4386
4387     common_end(s);
4388
4389     return 0;
4390 }
4391
4392 AVCodec snow_decoder = {
4393     "snow",
4394     CODEC_TYPE_VIDEO,
4395     CODEC_ID_SNOW,
4396     sizeof(SnowContext),
4397     decode_init,
4398     NULL,
4399     decode_end,
4400     decode_frame,
4401     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4402     NULL
4403 };
4404
#ifdef CONFIG_ENCODERS
/**
 * Codec descriptor of the snow encoder, only built when encoders are enabled.
 * The initializer is positional; members after the close callback are left
 * to their zero default.
 */
AVCodec snow_encoder = {
    "snow",               // codec name
    CODEC_TYPE_VIDEO,     // media type
    CODEC_ID_SNOW,        // codec id
    sizeof(SnowContext),  // size of the private context
    encode_init,          // init callback
    encode_frame,         // encode callback
    encode_end,           // close callback
};
#endif
4416
4417
#if 0
#undef malloc
#undef free
#undef printf

/**
 * Disabled stand-alone self test.
 *
 * 1. Round trips random data through the forward and inverse DWT for
 *    decomposition type 1 (reported as "5/3", must match exactly) and type 0
 *    (reported as "9/7", a deviation of up to 20 is tolerated).
 * 2. (additionally disabled) range coder put_symbol/get_symbol round trip.
 * 3. Measures the response of the inverse DWT to a single coefficient in each
 *    subband and prints the results as a visual_weight table, reduced by
 *    their greatest common divisor.
 * 4. Prints the centre of the forward transform of a 2x2 periodic pattern.
 *
 * int64_t values are cast to long long before being passed to "%lld", since
 * int64_t is not long long on every platform.
 */
int main(){
    int width=256;
    int height=256;
    int buffer[2][width*height];
    SnowContext s;
    int i;
    s.spatial_decomposition_count=6;
    s.spatial_decomposition_type=1;

    printf("testing 5/3 DWT\n");
    for(i=0; i<width*height; i++)
        buffer[0][i]= buffer[1][i]= random()%54321 - 12345;

    ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
    ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);

    for(i=0; i<width*height; i++)
        if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);

    printf("testing 9/7 DWT\n");
    s.spatial_decomposition_type=0;
    for(i=0; i<width*height; i++)
        buffer[0][i]= buffer[1][i]= random()%54321 - 12345;

    ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
    ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);

    for(i=0; i<width*height; i++)
        if(ABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);

#if 0
    printf("testing AC coder\n");
    memset(s.header_state, 0, sizeof(s.header_state));
    ff_init_range_encoder(&s.c, buffer[0], 256*256);
    ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);

    for(i=-256; i<256; i++){
START_TIMER
        put_symbol(&s.c, s.header_state, i*i*i/3*ABS(i), 1);
STOP_TIMER("put_symbol")
    }
    ff_rac_terminate(&s.c);

    memset(s.header_state, 0, sizeof(s.header_state));
    ff_init_range_decoder(&s.c, buffer[0], 256*256);
    ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);

    for(i=-256; i<256; i++){
        int j;
START_TIMER
        j= get_symbol(&s.c, s.header_state, 1);
STOP_TIMER("get_symbol")
        if(j!=i*i*i/3*ABS(i)) printf("fsck: %d != %d\n", i, j);
    }
#endif
{
int level, orientation, x, y;
int64_t errors[8][4];
int64_t g=0;

    memset(errors, 0, sizeof(errors));
    s.spatial_decomposition_count=3;
    s.spatial_decomposition_type=0;
    for(level=0; level<s.spatial_decomposition_count; level++){
        for(orientation=level ? 1 : 0; orientation<4; orientation++){
            int w= width  >> (s.spatial_decomposition_count-level);
            int h= height >> (s.spatial_decomposition_count-level);
            int stride= width  << (s.spatial_decomposition_count-level);
            DWTELEM *buf= buffer[0];
            int64_t error=0;

            // move to the first coefficient of this subband
            if(orientation&1) buf+=w;
            if(orientation>1) buf+=stride>>1;

            memset(buffer[0], 0, sizeof(int)*width*height);
            buf[w/2 + h/2*stride]= 256*256;
            ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
            for(y=0; y<height; y++){
                for(x=0; x<width; x++){
                    int64_t d= buffer[0][x + y*width];
                    error += d*d;
                    if(ABS(width/2-x)<9 && ABS(height/2-y)<9 && level==2) printf("%8lld ", (long long)d);
                }
                if(ABS(height/2-y)<9 && level==2) printf("\n");
            }
            error= (int)(sqrt(error)+0.5);
            errors[level][orientation]= error;
            if(g) g=ff_gcd(g, error);
            else g= error;
        }
    }
    // all responses were 0: avoid dividing by zero below
    if(!g) g= 1;
    printf("static int const visual_weight[][4]={\n");
    for(level=0; level<s.spatial_decomposition_count; level++){
        printf("  {");
        for(orientation=0; orientation<4; orientation++){
            printf("%8lld,", (long long)(errors[level][orientation]/g));
        }
        printf("},\n");
    }
    printf("};\n");
    {
            int level=2;
            int orientation=3;
            int w= width  >> (s.spatial_decomposition_count-level);
            int h= height >> (s.spatial_decomposition_count-level);
            int stride= width  << (s.spatial_decomposition_count-level);
            DWTELEM *buf= buffer[0];
            int64_t error=0;

            buf+=w;
            buf+=stride>>1;

            memset(buffer[0], 0, sizeof(int)*width*height);
#if 1
            for(y=0; y<height; y++){
                for(x=0; x<width; x++){
                    int tab[4]={0,2,3,1};
                    buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
                }
            }
            ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
#else
            for(y=0; y<h; y++){
                for(x=0; x<w; x++){
                    buf[x + y*stride  ]=169;
                    buf[x + y*stride-w]=64;
                }
            }
            ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
#endif
            for(y=0; y<height; y++){
                for(x=0; x<width; x++){
                    int64_t d= buffer[0][x + y*width];
                    error += d*d;
                    if(ABS(width/2-x)<9 && ABS(height/2-y)<9) printf("%8lld ", (long long)d);
                }
                if(ABS(height/2-y)<9) printf("\n");
            }
    }

}
    return 0;
}
#endif
4567