git.sesse.net Git - ffmpeg/blob - libavcodec/arm/dsputil_iwmmxt_rnd_template.c

   1 /*
   2  * iWMMXt optimized DSP utils
   3  * copyright (c) 2004 AGAWA Koji
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  23 {
  24     int stride = line_size;
  25     __asm__ volatile (
  26         "and r12, %[pixels], #7 \n\t"
  27         "bic %[pixels], %[pixels], #7 \n\t"
  28         "tmcr wcgr1, r12 \n\t"
  29         "add r4, %[pixels], %[line_size] \n\t"
  30         "add r5, %[block], %[line_size] \n\t"
  31         "mov %[line_size], %[line_size], lsl #1 \n\t"
  32         "1: \n\t"
  33         "wldrd wr0, [%[pixels]] \n\t"
  34         "subs %[h], %[h], #2 \n\t"
  35         "wldrd wr1, [%[pixels], #8] \n\t"
  36         "add %[pixels], %[pixels], %[line_size] \n\t"
  37         "wldrd wr3, [r4] \n\t"
  38         "pld [%[pixels]] \n\t"
  39         "pld [%[pixels], #32] \n\t"
  40         "wldrd wr4, [r4, #8] \n\t"
  41         "add r4, r4, %[line_size] \n\t"
  42         "walignr1 wr8, wr0, wr1 \n\t"
  43         "pld [r4] \n\t"
  44         "pld [r4, #32] \n\t"
  45         "walignr1 wr10, wr3, wr4 \n\t"
  46         "wstrd wr8, [%[block]] \n\t"
  47         "add %[block], %[block], %[line_size] \n\t"
  48         "wstrd wr10, [r5] \n\t"
  49         "add r5, r5, %[line_size] \n\t"
  50         "bne 1b \n\t"
  51         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  52         :
  53         : "memory", "r4", "r5", "r12");
  54 }
  55
  56 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  57 {
  58     int stride = line_size;
  59     __asm__ volatile (
  60         "and r12, %[pixels], #7 \n\t"
  61         "bic %[pixels], %[pixels], #7 \n\t"
  62         "tmcr wcgr1, r12 \n\t"
  63         "add r4, %[pixels], %[line_size] \n\t"
  64         "add r5, %[block], %[line_size] \n\t"
  65         "mov %[line_size], %[line_size], lsl #1 \n\t"
  66         "1: \n\t"
  67         "wldrd wr0, [%[pixels]] \n\t"
  68         "subs %[h], %[h], #2 \n\t"
  69         "wldrd wr1, [%[pixels], #8] \n\t"
  70         "add %[pixels], %[pixels], %[line_size] \n\t"
  71         "wldrd wr3, [r4] \n\t"
  72         "pld [%[pixels]] \n\t"
  73         "pld [%[pixels], #32] \n\t"
  74         "wldrd wr4, [r4, #8] \n\t"
  75         "add r4, r4, %[line_size] \n\t"
  76         "walignr1 wr8, wr0, wr1 \n\t"
  77         "wldrd wr0, [%[block]] \n\t"
  78         "wldrd wr2, [r5] \n\t"
  79         "pld [r4] \n\t"
  80         "pld [r4, #32] \n\t"
  81         "walignr1 wr10, wr3, wr4 \n\t"
  82         WAVG2B" wr8, wr8, wr0 \n\t"
  83         WAVG2B" wr10, wr10, wr2 \n\t"
  84         "wstrd wr8, [%[block]] \n\t"
  85         "add %[block], %[block], %[line_size] \n\t"
  86         "wstrd wr10, [r5] \n\t"
  87         "pld [%[block]] \n\t"
  88         "pld [%[block], #32] \n\t"
  89         "add r5, r5, %[line_size] \n\t"
  90         "pld [r5] \n\t"
  91         "pld [r5, #32] \n\t"
  92         "bne 1b \n\t"
  93         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  94         :
  95         : "memory", "r4", "r5", "r12");
  96 }
  97
  98 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  99 {
 100     int stride = line_size;
 101     __asm__ volatile (
 102         "and r12, %[pixels], #7 \n\t"
 103         "bic %[pixels], %[pixels], #7 \n\t"
 104         "tmcr wcgr1, r12 \n\t"
 105         "add r4, %[pixels], %[line_size] \n\t"
 106         "add r5, %[block], %[line_size] \n\t"
 107         "mov %[line_size], %[line_size], lsl #1 \n\t"
 108         "1: \n\t"
 109         "wldrd wr0, [%[pixels]] \n\t"
 110         "wldrd wr1, [%[pixels], #8] \n\t"
 111         "subs %[h], %[h], #2 \n\t"
 112         "wldrd wr2, [%[pixels], #16] \n\t"
 113         "add %[pixels], %[pixels], %[line_size] \n\t"
 114         "wldrd wr3, [r4] \n\t"
 115         "pld [%[pixels]] \n\t"
 116         "pld [%[pixels], #32] \n\t"
 117         "walignr1 wr8, wr0, wr1 \n\t"
 118         "wldrd wr4, [r4, #8] \n\t"
 119         "walignr1 wr9, wr1, wr2 \n\t"
 120         "wldrd wr5, [r4, #16] \n\t"
 121         "add r4, r4, %[line_size] \n\t"
 122         "pld [r4] \n\t"
 123         "pld [r4, #32] \n\t"
 124         "walignr1 wr10, wr3, wr4 \n\t"
 125         "wstrd wr8, [%[block]] \n\t"
 126         "walignr1 wr11, wr4, wr5 \n\t"
 127         "wstrd wr9, [%[block], #8] \n\t"
 128         "add %[block], %[block], %[line_size] \n\t"
 129         "wstrd wr10, [r5] \n\t"
 130         "wstrd wr11, [r5, #8] \n\t"
 131         "add r5, r5, %[line_size] \n\t"
 132         "bne 1b \n\t"
 133         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
 134         :
 135         : "memory", "r4", "r5", "r12");
 136 }
 137
 138 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 139 {
 140     int stride = line_size;
 141     __asm__ volatile (
 142         "pld [%[pixels]]                \n\t"
 143         "pld [%[pixels], #32]           \n\t"
 144         "pld [%[block]]                 \n\t"
 145         "pld [%[block], #32]            \n\t"
 146         "and r12, %[pixels], #7         \n\t"
 147         "bic %[pixels], %[pixels], #7   \n\t"
 148         "tmcr wcgr1, r12                \n\t"
 149         "add r4, %[pixels], %[line_size]\n\t"
 150         "add r5, %[block], %[line_size] \n\t"
 151         "mov %[line_size], %[line_size], lsl #1 \n\t"
 152         "1:                             \n\t"
 153         "wldrd wr0, [%[pixels]]         \n\t"
 154         "wldrd wr1, [%[pixels], #8]     \n\t"
 155         "subs %[h], %[h], #2            \n\t"
 156         "wldrd wr2, [%[pixels], #16]    \n\t"
 157         "add %[pixels], %[pixels], %[line_size] \n\t"
 158         "wldrd wr3, [r4]                \n\t"
 159         "pld [%[pixels]]                \n\t"
 160         "pld [%[pixels], #32]           \n\t"
 161         "walignr1 wr8, wr0, wr1         \n\t"
 162         "wldrd wr4, [r4, #8]            \n\t"
 163         "walignr1 wr9, wr1, wr2         \n\t"
 164         "wldrd wr5, [r4, #16]           \n\t"
 165         "add r4, r4, %[line_size]       \n\t"
 166         "wldrd wr0, [%[block]]          \n\t"
 167         "pld [r4]                       \n\t"
 168         "wldrd wr1, [%[block], #8]      \n\t"
 169         "pld [r4, #32]                  \n\t"
 170         "wldrd wr2, [r5]                \n\t"
 171         "walignr1 wr10, wr3, wr4        \n\t"
 172         "wldrd wr3, [r5, #8]            \n\t"
 173         WAVG2B" wr8, wr8, wr0           \n\t"
 174         WAVG2B" wr9, wr9, wr1           \n\t"
 175         WAVG2B" wr10, wr10, wr2         \n\t"
 176         "wstrd wr8, [%[block]]          \n\t"
 177         "walignr1 wr11, wr4, wr5        \n\t"
 178         WAVG2B" wr11, wr11, wr3         \n\t"
 179         "wstrd wr9, [%[block], #8]      \n\t"
 180         "add %[block], %[block], %[line_size] \n\t"
 181         "wstrd wr10, [r5]               \n\t"
 182         "pld [%[block]]                 \n\t"
 183         "pld [%[block], #32]            \n\t"
 184         "wstrd wr11, [r5, #8]           \n\t"
 185         "add r5, r5, %[line_size]       \n\t"
 186         "pld [r5]                       \n\t"
 187         "pld [r5, #32]                  \n\t"
 188         "bne 1b \n\t"
 189         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
 190         :
 191         : "memory", "r4", "r5", "r12");
 192 }
 193
 194 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 195 {
 196     int stride = line_size;
 197     // [wr0 wr1 wr2 wr3] for previous line
 198     // [wr4 wr5 wr6 wr7] for current line
 199     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 200     __asm__ volatile(
 201         "pld [%[pixels]]                \n\t"
 202         "pld [%[pixels], #32]           \n\t"
 203         "and r12, %[pixels], #7         \n\t"
 204         "bic %[pixels], %[pixels], #7   \n\t"
 205         "tmcr wcgr1, r12                \n\t"
 206         "add r12, r12, #1               \n\t"
 207         "add r4, %[pixels], %[line_size]\n\t"
 208         "tmcr wcgr2, r12                \n\t"
 209         "add r5, %[block], %[line_size] \n\t"
 210         "mov %[line_size], %[line_size], lsl #1 \n\t"
 211
 212         "1:                             \n\t"
 213         "wldrd wr10, [%[pixels]]        \n\t"
 214         "cmp r12, #8                    \n\t"
 215         "wldrd wr11, [%[pixels], #8]    \n\t"
 216         "add %[pixels], %[pixels], %[line_size] \n\t"
 217         "wldrd wr13, [r4]               \n\t"
 218         "pld [%[pixels]]                \n\t"
 219         "wldrd wr14, [r4, #8]           \n\t"
 220         "pld [%[pixels], #32]           \n\t"
 221         "add r4, r4, %[line_size]       \n\t"
 222         "walignr1 wr0, wr10, wr11       \n\t"
 223         "pld [r4]                       \n\t"
 224         "pld [r4, #32]                  \n\t"
 225         "walignr1 wr2, wr13, wr14       \n\t"
 226         "wmoveq wr4, wr11               \n\t"
 227         "wmoveq wr6, wr14               \n\t"
 228         "walignr2ne wr4, wr10, wr11     \n\t"
 229         "walignr2ne wr6, wr13, wr14     \n\t"
 230         WAVG2B" wr0, wr0, wr4           \n\t"
 231         WAVG2B" wr2, wr2, wr6           \n\t"
 232         "wstrd wr0, [%[block]]          \n\t"
 233         "subs %[h], %[h], #2            \n\t"
 234         "wstrd wr2, [r5]                \n\t"
 235         "add %[block], %[block], %[line_size]   \n\t"
 236         "add r5, r5, %[line_size]       \n\t"
 237         "bne 1b                         \n\t"
 238         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 239         :
 240         : "r4", "r5", "r12", "memory");
 241 }
 242
 243 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 244 {
 245     int stride = line_size;
 246     // [wr0 wr1 wr2 wr3] for previous line
 247     // [wr4 wr5 wr6 wr7] for current line
 248     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 249     __asm__ volatile(
 250         "pld [%[pixels]]                \n\t"
 251         "pld [%[pixels], #32]           \n\t"
 252         "and r12, %[pixels], #7         \n\t"
 253         "bic %[pixels], %[pixels], #7   \n\t"
 254         "tmcr wcgr1, r12                \n\t"
 255         "add r12, r12, #1               \n\t"
 256         "add r4, %[pixels], %[line_size]\n\t"
 257         "tmcr wcgr2, r12                \n\t"
 258         "add r5, %[block], %[line_size] \n\t"
 259         "mov %[line_size], %[line_size], lsl #1 \n\t"
 260
 261         "1:                             \n\t"
 262         "wldrd wr10, [%[pixels]]        \n\t"
 263         "cmp r12, #8                    \n\t"
 264         "wldrd wr11, [%[pixels], #8]    \n\t"
 265         "wldrd wr12, [%[pixels], #16]   \n\t"
 266         "add %[pixels], %[pixels], %[line_size] \n\t"
 267         "wldrd wr13, [r4]               \n\t"
 268         "pld [%[pixels]]                \n\t"
 269         "wldrd wr14, [r4, #8]           \n\t"
 270         "pld [%[pixels], #32]           \n\t"
 271         "wldrd wr15, [r4, #16]          \n\t"
 272         "add r4, r4, %[line_size]       \n\t"
 273         "walignr1 wr0, wr10, wr11       \n\t"
 274         "pld [r4]                       \n\t"
 275         "pld [r4, #32]                  \n\t"
 276         "walignr1 wr1, wr11, wr12       \n\t"
 277         "walignr1 wr2, wr13, wr14       \n\t"
 278         "walignr1 wr3, wr14, wr15       \n\t"
 279         "wmoveq wr4, wr11               \n\t"
 280         "wmoveq wr5, wr12               \n\t"
 281         "wmoveq wr6, wr14               \n\t"
 282         "wmoveq wr7, wr15               \n\t"
 283         "walignr2ne wr4, wr10, wr11     \n\t"
 284         "walignr2ne wr5, wr11, wr12     \n\t"
 285         "walignr2ne wr6, wr13, wr14     \n\t"
 286         "walignr2ne wr7, wr14, wr15     \n\t"
 287         WAVG2B" wr0, wr0, wr4           \n\t"
 288         WAVG2B" wr1, wr1, wr5           \n\t"
 289         "wstrd wr0, [%[block]]          \n\t"
 290         WAVG2B" wr2, wr2, wr6           \n\t"
 291         "wstrd wr1, [%[block], #8]      \n\t"
 292         WAVG2B" wr3, wr3, wr7           \n\t"
 293         "add %[block], %[block], %[line_size]   \n\t"
 294         "wstrd wr2, [r5]                \n\t"
 295         "subs %[h], %[h], #2            \n\t"
 296         "wstrd wr3, [r5, #8]            \n\t"
 297         "add r5, r5, %[line_size]       \n\t"
 298         "bne 1b                         \n\t"
 299         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 300         :
 301         : "r4", "r5", "r12", "memory");
 302 }
 303
 304 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 305 {
 306     int stride = line_size;
 307     // [wr0 wr1 wr2 wr3] for previous line
 308     // [wr4 wr5 wr6 wr7] for current line
 309     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 310     __asm__ volatile(
 311         "pld [%[pixels]]                \n\t"
 312         "pld [%[pixels], #32]           \n\t"
 313         "pld [%[block]]                 \n\t"
 314         "pld [%[block], #32]            \n\t"
 315         "and r12, %[pixels], #7         \n\t"
 316         "bic %[pixels], %[pixels], #7   \n\t"
 317         "tmcr wcgr1, r12                \n\t"
 318         "add r12, r12, #1               \n\t"
 319         "add r4, %[pixels], %[line_size]\n\t"
 320         "tmcr wcgr2, r12                \n\t"
 321         "add r5, %[block], %[line_size] \n\t"
 322         "mov %[line_size], %[line_size], lsl #1 \n\t"
 323         "pld [r5]                       \n\t"
 324         "pld [r5, #32]                  \n\t"
 325
 326         "1:                             \n\t"
 327         "wldrd wr10, [%[pixels]]        \n\t"
 328         "cmp r12, #8                    \n\t"
 329         "wldrd wr11, [%[pixels], #8]    \n\t"
 330         "add %[pixels], %[pixels], %[line_size] \n\t"
 331         "wldrd wr13, [r4]               \n\t"
 332         "pld [%[pixels]]                \n\t"
 333         "wldrd wr14, [r4, #8]           \n\t"
 334         "pld [%[pixels], #32]           \n\t"
 335         "add r4, r4, %[line_size]       \n\t"
 336         "walignr1 wr0, wr10, wr11       \n\t"
 337         "pld [r4]                       \n\t"
 338         "pld [r4, #32]                  \n\t"
 339         "walignr1 wr2, wr13, wr14       \n\t"
 340         "wmoveq wr4, wr11               \n\t"
 341         "wmoveq wr6, wr14               \n\t"
 342         "walignr2ne wr4, wr10, wr11     \n\t"
 343         "wldrd wr10, [%[block]]         \n\t"
 344         "walignr2ne wr6, wr13, wr14     \n\t"
 345         "wldrd wr12, [r5]               \n\t"
 346         WAVG2B" wr0, wr0, wr4           \n\t"
 347         WAVG2B" wr2, wr2, wr6           \n\t"
 348         WAVG2B" wr0, wr0, wr10          \n\t"
 349         WAVG2B" wr2, wr2, wr12          \n\t"
 350         "wstrd wr0, [%[block]]          \n\t"
 351         "subs %[h], %[h], #2            \n\t"
 352         "wstrd wr2, [r5]                \n\t"
 353         "add %[block], %[block], %[line_size]   \n\t"
 354         "add r5, r5, %[line_size]       \n\t"
 355         "pld [%[block]]                 \n\t"
 356         "pld [%[block], #32]            \n\t"
 357         "pld [r5]                       \n\t"
 358         "pld [r5, #32]                  \n\t"
 359         "bne 1b                         \n\t"
 360         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 361         :
 362         : "r4", "r5", "r12", "memory");
 363 }
 364
 365 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 366 {
 367     int stride = line_size;
 368     // [wr0 wr1 wr2 wr3] for previous line
 369     // [wr4 wr5 wr6 wr7] for current line
 370     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 371     __asm__ volatile(
 372         "pld [%[pixels]]                \n\t"
 373         "pld [%[pixels], #32]           \n\t"
 374         "pld [%[block]]                 \n\t"
 375         "pld [%[block], #32]            \n\t"
 376         "and r12, %[pixels], #7         \n\t"
 377         "bic %[pixels], %[pixels], #7   \n\t"
 378         "tmcr wcgr1, r12                \n\t"
 379         "add r12, r12, #1               \n\t"
 380         "add r4, %[pixels], %[line_size]\n\t"
 381         "tmcr wcgr2, r12                \n\t"
 382         "add r5, %[block], %[line_size] \n\t"
 383         "mov %[line_size], %[line_size], lsl #1 \n\t"
 384         "pld [r5]                       \n\t"
 385         "pld [r5, #32]                  \n\t"
 386
 387         "1:                             \n\t"
 388         "wldrd wr10, [%[pixels]]        \n\t"
 389         "cmp r12, #8                    \n\t"
 390         "wldrd wr11, [%[pixels], #8]    \n\t"
 391         "wldrd wr12, [%[pixels], #16]   \n\t"
 392         "add %[pixels], %[pixels], %[line_size] \n\t"
 393         "wldrd wr13, [r4]               \n\t"
 394         "pld [%[pixels]]                \n\t"
 395         "wldrd wr14, [r4, #8]           \n\t"
 396         "pld [%[pixels], #32]           \n\t"
 397         "wldrd wr15, [r4, #16]          \n\t"
 398         "add r4, r4, %[line_size]       \n\t"
 399         "walignr1 wr0, wr10, wr11       \n\t"
 400         "pld [r4]                       \n\t"
 401         "pld [r4, #32]                  \n\t"
 402         "walignr1 wr1, wr11, wr12       \n\t"
 403         "walignr1 wr2, wr13, wr14       \n\t"
 404         "walignr1 wr3, wr14, wr15       \n\t"
 405         "wmoveq wr4, wr11               \n\t"
 406         "wmoveq wr5, wr12               \n\t"
 407         "wmoveq wr6, wr14               \n\t"
 408         "wmoveq wr7, wr15               \n\t"
 409         "walignr2ne wr4, wr10, wr11     \n\t"
 410         "walignr2ne wr5, wr11, wr12     \n\t"
 411         "walignr2ne wr6, wr13, wr14     \n\t"
 412         "walignr2ne wr7, wr14, wr15     \n\t"
 413         "wldrd wr10, [%[block]]         \n\t"
 414         WAVG2B" wr0, wr0, wr4           \n\t"
 415         "wldrd wr11, [%[block], #8]     \n\t"
 416         WAVG2B" wr1, wr1, wr5           \n\t"
 417         "wldrd wr12, [r5]               \n\t"
 418         WAVG2B" wr2, wr2, wr6           \n\t"
 419         "wldrd wr13, [r5, #8]           \n\t"
 420         WAVG2B" wr3, wr3, wr7           \n\t"
 421         WAVG2B" wr0, wr0, wr10          \n\t"
 422         WAVG2B" wr1, wr1, wr11          \n\t"
 423         WAVG2B" wr2, wr2, wr12          \n\t"
 424         WAVG2B" wr3, wr3, wr13          \n\t"
 425         "wstrd wr0, [%[block]]          \n\t"
 426         "subs %[h], %[h], #2            \n\t"
 427         "wstrd wr1, [%[block], #8]      \n\t"
 428         "add %[block], %[block], %[line_size]   \n\t"
 429         "wstrd wr2, [r5]                \n\t"
 430         "pld [%[block]]                 \n\t"
 431         "wstrd wr3, [r5, #8]            \n\t"
 432         "add r5, r5, %[line_size]       \n\t"
 433         "pld [%[block], #32]            \n\t"
 434         "pld [r5]                       \n\t"
 435         "pld [r5, #32]                  \n\t"
 436         "bne 1b                         \n\t"
 437         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 438         :
 439         :"r4", "r5", "r12", "memory");
 440 }
 441
 442 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 443 {
 444     int stride = line_size;
 445     // [wr0 wr1 wr2 wr3] for previous line
 446     // [wr4 wr5 wr6 wr7] for current line
 447     __asm__ volatile(
 448         "pld            [%[pixels]]                             \n\t"
 449         "pld            [%[pixels], #32]                        \n\t"
 450         "and            r12, %[pixels], #7                      \n\t"
 451         "tmcr           wcgr1, r12                              \n\t"
 452         "bic            %[pixels], %[pixels], #7                \n\t"
 453
 454         "wldrd          wr10, [%[pixels]]                       \n\t"
 455         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 456         "pld            [%[block]]                              \n\t"
 457         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 458         "walignr1       wr0, wr10, wr11                         \n\t"
 459         "pld            [%[pixels]]                             \n\t"
 460         "pld            [%[pixels], #32]                        \n\t"
 461
 462       "1:                                                       \n\t"
 463         "wldrd          wr10, [%[pixels]]                       \n\t"
 464         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 465         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 466         "pld            [%[pixels]]                             \n\t"
 467         "pld            [%[pixels], #32]                        \n\t"
 468         "walignr1       wr4, wr10, wr11                         \n\t"
 469         "wldrd          wr10, [%[block]]                        \n\t"
 470          WAVG2B"        wr8, wr0, wr4                           \n\t"
 471          WAVG2B"        wr8, wr8, wr10                          \n\t"
 472         "wstrd          wr8, [%[block]]                         \n\t"
 473         "add            %[block], %[block], %[line_size]        \n\t"
 474
 475         "wldrd          wr10, [%[pixels]]                       \n\t"
 476         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 477         "pld            [%[block]]                              \n\t"
 478         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 479         "pld            [%[pixels]]                             \n\t"
 480         "pld            [%[pixels], #32]                        \n\t"
 481         "walignr1       wr0, wr10, wr11                         \n\t"
 482         "wldrd          wr10, [%[block]]                        \n\t"
 483          WAVG2B"        wr8, wr0, wr4                           \n\t"
 484          WAVG2B"        wr8, wr8, wr10                          \n\t"
 485         "wstrd          wr8, [%[block]]                         \n\t"
 486         "add            %[block], %[block], %[line_size]        \n\t"
 487
 488         "subs           %[h], %[h], #2                          \n\t"
 489         "pld            [%[block]]                              \n\t"
 490         "bne            1b                                      \n\t"
 491         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 492         :
 493         : "cc", "memory", "r12");
 494 }
 495
 496 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 497 {
 498     int stride = line_size;
 499     // [wr0 wr1 wr2 wr3] for previous line
 500     // [wr4 wr5 wr6 wr7] for current line
 501     __asm__ volatile(
 502         "pld [%[pixels]]                \n\t"
 503         "pld [%[pixels], #32]           \n\t"
 504         "and r12, %[pixels], #7         \n\t"
 505         "tmcr wcgr1, r12                \n\t"
 506         "bic %[pixels], %[pixels], #7   \n\t"
 507
 508         "wldrd wr10, [%[pixels]]        \n\t"
 509         "wldrd wr11, [%[pixels], #8]    \n\t"
 510         "wldrd wr12, [%[pixels], #16]   \n\t"
 511         "add %[pixels], %[pixels], %[line_size] \n\t"
 512         "pld [%[pixels]]                \n\t"
 513         "pld [%[pixels], #32]           \n\t"
 514         "walignr1 wr0, wr10, wr11       \n\t"
 515         "walignr1 wr1, wr11, wr12       \n\t"
 516
 517         "1:                             \n\t"
 518         "wldrd wr10, [%[pixels]]        \n\t"
 519         "wldrd wr11, [%[pixels], #8]    \n\t"
 520         "wldrd wr12, [%[pixels], #16]   \n\t"
 521         "add %[pixels], %[pixels], %[line_size] \n\t"
 522         "pld [%[pixels]]                \n\t"
 523         "pld [%[pixels], #32]           \n\t"
 524         "walignr1 wr4, wr10, wr11       \n\t"
 525         "walignr1 wr5, wr11, wr12       \n\t"
 526         WAVG2B" wr8, wr0, wr4           \n\t"
 527         WAVG2B" wr9, wr1, wr5           \n\t"
 528         "wstrd wr8, [%[block]]          \n\t"
 529         "wstrd wr9, [%[block], #8]      \n\t"
 530         "add %[block], %[block], %[line_size]   \n\t"
 531
 532         "wldrd wr10, [%[pixels]]        \n\t"
 533         "wldrd wr11, [%[pixels], #8]    \n\t"
 534         "wldrd wr12, [%[pixels], #16]   \n\t"
 535         "add %[pixels], %[pixels], %[line_size] \n\t"
 536         "pld [%[pixels]]                \n\t"
 537         "pld [%[pixels], #32]           \n\t"
 538         "walignr1 wr0, wr10, wr11       \n\t"
 539         "walignr1 wr1, wr11, wr12       \n\t"
 540         WAVG2B" wr8, wr0, wr4           \n\t"
 541         WAVG2B" wr9, wr1, wr5           \n\t"
 542         "wstrd wr8, [%[block]]          \n\t"
 543         "wstrd wr9, [%[block], #8]      \n\t"
 544         "add %[block], %[block], %[line_size]   \n\t"
 545
 546         "subs %[h], %[h], #2            \n\t"
 547         "bne 1b                         \n\t"
 548         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 549         :
 550         : "r4", "r5", "r12", "memory");
 551 }
 552
 553 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 554 {
 555     int stride = line_size;
 556     // [wr0 wr1 wr2 wr3] for previous line
 557     // [wr4 wr5 wr6 wr7] for current line
 558     __asm__ volatile(
 559         "pld [%[pixels]]                \n\t"
 560         "pld [%[pixels], #32]           \n\t"
 561         "and r12, %[pixels], #7         \n\t"
 562         "tmcr wcgr1, r12                \n\t"
 563         "bic %[pixels], %[pixels], #7   \n\t"
 564
 565         "wldrd wr10, [%[pixels]]        \n\t"
 566         "wldrd wr11, [%[pixels], #8]    \n\t"
 567         "pld [%[block]]                 \n\t"
 568         "wldrd wr12, [%[pixels], #16]   \n\t"
 569         "add %[pixels], %[pixels], %[line_size] \n\t"
 570         "pld [%[pixels]]                \n\t"
 571         "pld [%[pixels], #32]           \n\t"
 572         "walignr1 wr0, wr10, wr11       \n\t"
 573         "walignr1 wr1, wr11, wr12       \n\t"
 574
 575         "1:                             \n\t"
 576         "wldrd wr10, [%[pixels]]        \n\t"
 577         "wldrd wr11, [%[pixels], #8]    \n\t"
 578         "wldrd wr12, [%[pixels], #16]   \n\t"
 579         "add %[pixels], %[pixels], %[line_size] \n\t"
 580         "pld [%[pixels]]                \n\t"
 581         "pld [%[pixels], #32]           \n\t"
 582         "walignr1 wr4, wr10, wr11       \n\t"
 583         "walignr1 wr5, wr11, wr12       \n\t"
 584         "wldrd wr10, [%[block]]         \n\t"
 585         "wldrd wr11, [%[block], #8]     \n\t"
 586         WAVG2B" wr8, wr0, wr4           \n\t"
 587         WAVG2B" wr9, wr1, wr5           \n\t"
 588         WAVG2B" wr8, wr8, wr10          \n\t"
 589         WAVG2B" wr9, wr9, wr11          \n\t"
 590         "wstrd wr8, [%[block]]          \n\t"
 591         "wstrd wr9, [%[block], #8]      \n\t"
 592         "add %[block], %[block], %[line_size]   \n\t"
 593
 594         "wldrd wr10, [%[pixels]]        \n\t"
 595         "wldrd wr11, [%[pixels], #8]    \n\t"
 596         "pld [%[block]]                 \n\t"
 597         "wldrd wr12, [%[pixels], #16]   \n\t"
 598         "add %[pixels], %[pixels], %[line_size] \n\t"
 599         "pld [%[pixels]]                \n\t"
 600         "pld [%[pixels], #32]           \n\t"
 601         "walignr1 wr0, wr10, wr11       \n\t"
 602         "walignr1 wr1, wr11, wr12       \n\t"
 603         "wldrd wr10, [%[block]]         \n\t"
 604         "wldrd wr11, [%[block], #8]     \n\t"
 605         WAVG2B" wr8, wr0, wr4           \n\t"
 606         WAVG2B" wr9, wr1, wr5           \n\t"
 607         WAVG2B" wr8, wr8, wr10          \n\t"
 608         WAVG2B" wr9, wr9, wr11          \n\t"
 609         "wstrd wr8, [%[block]]          \n\t"
 610         "wstrd wr9, [%[block], #8]      \n\t"
 611         "add %[block], %[block], %[line_size]   \n\t"
 612
 613         "subs %[h], %[h], #2            \n\t"
 614         "pld [%[block]]                 \n\t"
 615         "bne 1b                         \n\t"
 616         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 617         :
 618         : "r4", "r5", "r12", "memory");
 619 }
 620
 621 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 622 {
 623     // [wr0 wr1 wr2 wr3] for previous line
 624     // [wr4 wr5 wr6 wr7] for current line
 625     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 626     __asm__ volatile(
 627         "pld [%[pixels]]                \n\t"
 628         "mov r12, #2                    \n\t"
 629         "pld [%[pixels], #32]           \n\t"
 630         "tmcr wcgr0, r12                \n\t" /* for shift value */
 631         "and r12, %[pixels], #7         \n\t"
 632         "bic %[pixels], %[pixels], #7   \n\t"
 633         "tmcr wcgr1, r12                \n\t"
 634
 635         // [wr0 wr1 wr2 wr3] <= *
 636         // [wr4 wr5 wr6 wr7]
 637         "wldrd wr12, [%[pixels]]        \n\t"
 638         "add r12, r12, #1               \n\t"
 639         "wldrd wr13, [%[pixels], #8]    \n\t"
 640         "tmcr wcgr2, r12                \n\t"
 641         "add %[pixels], %[pixels], %[line_size] \n\t"
 642         "cmp r12, #8                    \n\t"
 643         "pld [%[pixels]]                \n\t"
 644         "pld [%[pixels], #32]           \n\t"
 645         "walignr1 wr2, wr12, wr13       \n\t"
 646         "wmoveq wr10, wr13              \n\t"
 647         "walignr2ne wr10, wr12, wr13    \n\t"
 648         "wunpckelub wr0, wr2            \n\t"
 649         "wunpckehub wr1, wr2            \n\t"
 650         "wunpckelub wr8, wr10           \n\t"
 651         "wunpckehub wr9, wr10           \n\t"
 652         "waddhus wr0, wr0, wr8          \n\t"
 653         "waddhus wr1, wr1, wr9          \n\t"
 654
 655         "1:                             \n\t"
 656         // [wr0 wr1 wr2 wr3]
 657         // [wr4 wr5 wr6 wr7] <= *
 658         "wldrd wr12, [%[pixels]]        \n\t"
 659         "cmp r12, #8                    \n\t"
 660         "wldrd wr13, [%[pixels], #8]    \n\t"
 661         "add %[pixels], %[pixels], %[line_size] \n\t"
 662         "walignr1 wr6, wr12, wr13       \n\t"
 663         "pld [%[pixels]]                \n\t"
 664         "pld [%[pixels], #32]           \n\t"
 665         "wmoveq wr10, wr13              \n\t"
 666         "walignr2ne wr10, wr12, wr13    \n\t"
 667         "wunpckelub wr4, wr6            \n\t"
 668         "wunpckehub wr5, wr6            \n\t"
 669         "wunpckelub wr8, wr10           \n\t"
 670         "wunpckehub wr9, wr10           \n\t"
 671         "waddhus wr4, wr4, wr8          \n\t"
 672         "waddhus wr5, wr5, wr9          \n\t"
 673         "waddhus wr8, wr0, wr4          \n\t"
 674         "waddhus wr9, wr1, wr5          \n\t"
 675         "waddhus wr8, wr8, wr15         \n\t"
 676         "waddhus wr9, wr9, wr15         \n\t"
 677         "wsrlhg wr8, wr8, wcgr0         \n\t"
 678         "wsrlhg wr9, wr9, wcgr0         \n\t"
 679         "wpackhus wr8, wr8, wr9         \n\t"
 680         "wstrd wr8, [%[block]]          \n\t"
 681         "add %[block], %[block], %[line_size]   \n\t"
 682
 683         // [wr0 wr1 wr2 wr3] <= *
 684         // [wr4 wr5 wr6 wr7]
 685         "wldrd wr12, [%[pixels]]        \n\t"
 686         "wldrd wr13, [%[pixels], #8]    \n\t"
 687         "add %[pixels], %[pixels], %[line_size] \n\t"
 688         "walignr1 wr2, wr12, wr13       \n\t"
 689         "pld [%[pixels]]                \n\t"
 690         "pld [%[pixels], #32]           \n\t"
 691         "wmoveq wr10, wr13              \n\t"
 692         "walignr2ne wr10, wr12, wr13    \n\t"
 693         "wunpckelub wr0, wr2            \n\t"
 694         "wunpckehub wr1, wr2            \n\t"
 695         "wunpckelub wr8, wr10           \n\t"
 696         "wunpckehub wr9, wr10           \n\t"
 697         "waddhus wr0, wr0, wr8          \n\t"
 698         "waddhus wr1, wr1, wr9          \n\t"
 699         "waddhus wr8, wr0, wr4          \n\t"
 700         "waddhus wr9, wr1, wr5          \n\t"
 701         "waddhus wr8, wr8, wr15         \n\t"
 702         "waddhus wr9, wr9, wr15         \n\t"
 703         "wsrlhg wr8, wr8, wcgr0         \n\t"
 704         "wsrlhg wr9, wr9, wcgr0         \n\t"
 705         "wpackhus wr8, wr8, wr9         \n\t"
 706         "subs %[h], %[h], #2            \n\t"
 707         "wstrd wr8, [%[block]]          \n\t"
 708         "add %[block], %[block], %[line_size]   \n\t"
 709         "bne 1b                         \n\t"
 710         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 711         : [line_size]"r"(line_size)
 712         : "r12", "memory");
 713 }
 714
 715 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 716 {
         /*
          * Store a 16-pixel-wide, h-row block in which every output byte is
          *     (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2
          * where p and q are two vertically adjacent source rows.
          *
          * block:     destination; 16 bytes are written per row with two wstrd
          *            stores, then the pointer advances by line_size.
          * pixels:    source; rounded down to a multiple of 8, 24 bytes are
          *            loaded per row and realigned with walignr1/walignr2.
          * line_size: byte distance between rows, used for both pointers.
          * h:         row count; two rows are produced per loop pass and the
          *            loop only exits when h reaches exactly 0, so h has to be
          *            a positive even number.
          *
          * h + 1 source rows are read in total (one before the loop, two per
          * pass).  SET_RND is defined outside this function; it is expected to
          * leave the rounding constant in each halfword of wr15 -- TODO confirm.
          */
 717     // [wr0 wr1 wr2 wr3] for previous line
 718     // [wr4 wr5 wr6 wr7] for current line
 719     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 720     __asm__ volatile(
 721         "pld [%[pixels]]                \n\t"
 722         "mov r12, #2                    \n\t"
 723         "pld [%[pixels], #32]           \n\t"
 724         "tmcr wcgr0, r12                \n\t" /* for shift value */
 725         /* alignment */
 726         "and r12, %[pixels], #7         \n\t"
 727         "bic %[pixels], %[pixels], #7   \n\t"
 728         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset of pixel x */
 729         "add r12, r12, #1               \n\t"
 730         "tmcr wcgr2, r12                \n\t" /* wcgr2 = byte offset of pixel x+1 */
 731
 732         // [wr0 wr1 wr2 wr3] <= *
 733         // [wr4 wr5 wr6 wr7]
 734         "wldrd wr12, [%[pixels]]        \n\t"
 735         "cmp r12, #8                    \n\t" /* eq: offset+1 == 8, handled by wmoveq below */
 736         "wldrd wr13, [%[pixels], #8]    \n\t"
 737         "wldrd wr14, [%[pixels], #16]   \n\t"
 738         "add %[pixels], %[pixels], %[line_size] \n\t"
 739         "pld [%[pixels]]                \n\t"
 740         "walignr1 wr2, wr12, wr13       \n\t"
 741         "pld [%[pixels], #32]           \n\t"
 742         "walignr1 wr3, wr13, wr14       \n\t"
         /* offset 8 takes the following doubleword unchanged; presumably an
          * alignment of 8 cannot be expressed through wcgr2 -- TODO confirm */
 743         "wmoveq wr10, wr13              \n\t"
 744         "wmoveq wr11, wr14              \n\t"
 745         "walignr2ne wr10, wr12, wr13    \n\t"
 746         "walignr2ne wr11, wr13, wr14    \n\t"
 747         "wunpckelub wr0, wr2            \n\t"
 748         "wunpckehub wr1, wr2            \n\t"
 749         "wunpckelub wr2, wr3            \n\t"
 750         "wunpckehub wr3, wr3            \n\t"
 751         "wunpckelub wr8, wr10           \n\t"
 752         "wunpckehub wr9, wr10           \n\t"
 753         "wunpckelub wr10, wr11          \n\t"
 754         "wunpckehub wr11, wr11          \n\t"
 755         "waddhus wr0, wr0, wr8          \n\t" /* wr0..wr3 = p[x] + p[x+1] as halfwords */
 756         "waddhus wr1, wr1, wr9          \n\t"
 757         "waddhus wr2, wr2, wr10         \n\t"
 758         "waddhus wr3, wr3, wr11         \n\t"
 759
 760         "1:                             \n\t"
 761         // [wr0 wr1 wr2 wr3]
 762         // [wr4 wr5 wr6 wr7] <= *
 763         "wldrd wr12, [%[pixels]]        \n\t"
 764         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
 765         "wldrd wr13, [%[pixels], #8]    \n\t"
 766         "wldrd wr14, [%[pixels], #16]   \n\t"
 767         "add %[pixels], %[pixels], %[line_size] \n\t"
 768         "walignr1 wr6, wr12, wr13       \n\t"
 769         "pld [%[pixels]]                \n\t"
 770         "pld [%[pixels], #32]           \n\t"
 771         "walignr1 wr7, wr13, wr14       \n\t"
 772         "wmoveq wr10, wr13              \n\t"
 773         "wmoveq wr11, wr14              \n\t"
 774         "walignr2ne wr10, wr12, wr13    \n\t"
 775         "walignr2ne wr11, wr13, wr14    \n\t"
 776         "wunpckelub wr4, wr6            \n\t"
 777         "wunpckehub wr5, wr6            \n\t"
 778         "wunpckelub wr6, wr7            \n\t"
 779         "wunpckehub wr7, wr7            \n\t"
 780         "wunpckelub wr8, wr10           \n\t"
 781         "wunpckehub wr9, wr10           \n\t"
 782         "wunpckelub wr10, wr11          \n\t"
 783         "wunpckehub wr11, wr11          \n\t"
 784         "waddhus wr4, wr4, wr8          \n\t" /* wr4..wr7 = q[x] + q[x+1] as halfwords */
 785         "waddhus wr5, wr5, wr9          \n\t"
 786         "waddhus wr6, wr6, wr10         \n\t"
 787         "waddhus wr7, wr7, wr11         \n\t"
 788         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
 789         "waddhus wr9, wr1, wr5          \n\t"
 790         "waddhus wr10, wr2, wr6         \n\t"
 791         "waddhus wr11, wr3, wr7         \n\t"
 792         "waddhus wr8, wr8, wr15         \n\t" /* + rounding constant */
 793         "waddhus wr9, wr9, wr15         \n\t"
 794         "waddhus wr10, wr10, wr15       \n\t"
 795         "waddhus wr11, wr11, wr15       \n\t"
 796         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
 797         "wsrlhg wr9, wr9, wcgr0         \n\t"
 798         "wsrlhg wr10, wr10, wcgr0       \n\t"
 799         "wsrlhg wr11, wr11, wcgr0       \n\t"
 800         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to 8 + 8 bytes */
 801         "wpackhus wr9, wr10, wr11       \n\t"
 802         "wstrd wr8, [%[block]]          \n\t"
 803         "wstrd wr9, [%[block], #8]      \n\t"
 804         "add %[block], %[block], %[line_size]   \n\t"
 805
 806         // [wr0 wr1 wr2 wr3] <= *
 807         // [wr4 wr5 wr6 wr7]
         /* second row of the pass: the row just summed in wr4..wr7 now acts as
          * the previous row; the eq/ne flags of the cmp above are reused, no
          * instruction in between is a flag-setting ARM instruction */
 808         "wldrd wr12, [%[pixels]]        \n\t"
 809         "wldrd wr13, [%[pixels], #8]    \n\t"
 810         "wldrd wr14, [%[pixels], #16]   \n\t"
 811         "add %[pixels], %[pixels], %[line_size] \n\t"
 812         "walignr1 wr2, wr12, wr13       \n\t"
 813         "pld [%[pixels]]                \n\t"
 814         "pld [%[pixels], #32]           \n\t"
 815         "walignr1 wr3, wr13, wr14       \n\t"
 816         "wmoveq wr10, wr13              \n\t"
 817         "wmoveq wr11, wr14              \n\t"
 818         "walignr2ne wr10, wr12, wr13    \n\t"
 819         "walignr2ne wr11, wr13, wr14    \n\t"
 820         "wunpckelub wr0, wr2            \n\t"
 821         "wunpckehub wr1, wr2            \n\t"
 822         "wunpckelub wr2, wr3            \n\t"
 823         "wunpckehub wr3, wr3            \n\t"
 824         "wunpckelub wr8, wr10           \n\t"
 825         "wunpckehub wr9, wr10           \n\t"
 826         "wunpckelub wr10, wr11          \n\t"
 827         "wunpckehub wr11, wr11          \n\t"
 828         "waddhus wr0, wr0, wr8          \n\t"
 829         "waddhus wr1, wr1, wr9          \n\t"
 830         "waddhus wr2, wr2, wr10         \n\t"
 831         "waddhus wr3, wr3, wr11         \n\t"
 832         "waddhus wr8, wr0, wr4          \n\t"
 833         "waddhus wr9, wr1, wr5          \n\t"
 834         "waddhus wr10, wr2, wr6         \n\t"
 835         "waddhus wr11, wr3, wr7         \n\t"
 836         "waddhus wr8, wr8, wr15         \n\t"
 837         "waddhus wr9, wr9, wr15         \n\t"
 838         "waddhus wr10, wr10, wr15       \n\t"
 839         "waddhus wr11, wr11, wr15       \n\t"
 840         "wsrlhg wr8, wr8, wcgr0         \n\t"
 841         "wsrlhg wr9, wr9, wcgr0         \n\t"
 842         "wsrlhg wr10, wr10, wcgr0       \n\t"
 843         "wsrlhg wr11, wr11, wcgr0       \n\t"
 844         "wpackhus wr8, wr8, wr9         \n\t"
 845         "wpackhus wr9, wr10, wr11       \n\t"
 846         "wstrd wr8, [%[block]]          \n\t"
 847         "wstrd wr9, [%[block], #8]      \n\t"
 848         "add %[block], %[block], %[line_size]   \n\t"
 849
 850         "subs %[h], %[h], #2            \n\t" /* two rows done; Z set when h hits 0 */
 851         "bne 1b                         \n\t"
 852         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 853         : [line_size]"r"(line_size)
         /* cmp and subs overwrite the condition flags, hence "cc" */
 854         : "r12", "cc", "memory");
 855 }
 856
 857 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 858 {
         /*
          * 8-pixel-wide 2x2 interpolation merged into the destination: for each
          * row the value
          *     (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2
          * is computed from two vertically adjacent source rows p and q, packed
          * to bytes, combined with the 8 bytes already stored at block through
          * WAVG2B, and written back.
          *
          * block:     destination; read and written 8 bytes per row, advanced
          *            by line_size after each row.
          * pixels:    source; rounded down to a multiple of 8, 16 bytes are
          *            loaded per row and realigned with walignr1/walignr2.
          * line_size: byte distance between rows, used for both pointers.
          * h:         row count; two rows are produced per loop pass and the
          *            loop only exits when h reaches exactly 0, so h has to be
          *            a positive even number.
          *
          * h + 1 source rows are read in total.  SET_RND and WAVG2B are macros
          * defined outside this function; WAVG2B presumably expands to the
          * byte-average instruction matching the rounding mode -- TODO confirm.
          */
 859     // [wr0 wr1 wr2 wr3] for previous line
 860     // [wr4 wr5 wr6 wr7] for current line
 861     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 862     __asm__ volatile(
 863         "pld [%[block]]                 \n\t"
 864         "pld [%[block], #32]            \n\t"
 865         "pld [%[pixels]]                \n\t"
 866         "mov r12, #2                    \n\t"
 867         "pld [%[pixels], #32]           \n\t"
 868         "tmcr wcgr0, r12                \n\t" /* for shift value */
 869         "and r12, %[pixels], #7         \n\t"
 870         "bic %[pixels], %[pixels], #7   \n\t"
 871         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset of pixel x */
 872
 873         // [wr0 wr1 wr2 wr3] <= *
 874         // [wr4 wr5 wr6 wr7]
 875         "wldrd wr12, [%[pixels]]        \n\t"
 876         "add r12, r12, #1               \n\t"
 877         "wldrd wr13, [%[pixels], #8]    \n\t"
 878         "tmcr wcgr2, r12                \n\t" /* wcgr2 = byte offset of pixel x+1 */
 879         "add %[pixels], %[pixels], %[line_size] \n\t"
 880         "cmp r12, #8                    \n\t" /* eq: offset+1 == 8, handled by wmoveq below */
 881         "pld [%[pixels]]                \n\t"
 882         "pld [%[pixels], #32]           \n\t"
 883         "walignr1 wr2, wr12, wr13       \n\t"
         /* offset 8 takes the following doubleword unchanged; presumably an
          * alignment of 8 cannot be expressed through wcgr2 -- TODO confirm */
 884         "wmoveq wr10, wr13              \n\t"
 885         "walignr2ne wr10, wr12, wr13    \n\t"
 886         "wunpckelub wr0, wr2            \n\t"
 887         "wunpckehub wr1, wr2            \n\t"
 888         "wunpckelub wr8, wr10           \n\t"
 889         "wunpckehub wr9, wr10           \n\t"
 890         "waddhus wr0, wr0, wr8          \n\t" /* wr0/wr1 = p[x] + p[x+1] as halfwords */
 891         "waddhus wr1, wr1, wr9          \n\t"
 892
 893         "1:                             \n\t"
 894         // [wr0 wr1 wr2 wr3]
 895         // [wr4 wr5 wr6 wr7] <= *
 896         "wldrd wr12, [%[pixels]]        \n\t"
 897         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
 898         "wldrd wr13, [%[pixels], #8]    \n\t"
 899         "add %[pixels], %[pixels], %[line_size] \n\t"
 900         "walignr1 wr6, wr12, wr13       \n\t"
 901         "pld [%[pixels]]                \n\t"
 902         "pld [%[pixels], #32]           \n\t"
 903         "wmoveq wr10, wr13              \n\t"
 904         "walignr2ne wr10, wr12, wr13    \n\t"
 905         "wunpckelub wr4, wr6            \n\t"
 906         "wunpckehub wr5, wr6            \n\t"
 907         "wunpckelub wr8, wr10           \n\t"
 908         "wunpckehub wr9, wr10           \n\t"
 909         "waddhus wr4, wr4, wr8          \n\t" /* wr4/wr5 = q[x] + q[x+1] as halfwords */
 910         "waddhus wr5, wr5, wr9          \n\t"
 911         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
 912         "waddhus wr9, wr1, wr5          \n\t"
 913         "waddhus wr8, wr8, wr15         \n\t" /* + rounding constant */
 914         "waddhus wr9, wr9, wr15         \n\t"
 915         "wldrd wr12, [%[block]]         \n\t" /* current destination bytes */
 916         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
 917         "wsrlhg wr9, wr9, wcgr0         \n\t"
 918         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to 8 bytes */
 919         WAVG2B" wr8, wr8, wr12          \n\t"
 920         "wstrd wr8, [%[block]]          \n\t"
 921         "add %[block], %[block], %[line_size]   \n\t"
 922         "wldrd wr12, [%[pixels]]        \n\t" /* first load of the next source row, issued early */
 923         "pld [%[block]]                 \n\t"
 924         "pld [%[block], #32]            \n\t"
 925
 926         // [wr0 wr1 wr2 wr3] <= *
 927         // [wr4 wr5 wr6 wr7]
         /* second row of the pass: the row just summed in wr4/wr5 now acts as
          * the previous row; the eq/ne flags of the cmp above are reused, no
          * instruction in between is a flag-setting ARM instruction */
 928         "wldrd wr13, [%[pixels], #8]    \n\t"
 929         "add %[pixels], %[pixels], %[line_size] \n\t"
 930         "walignr1 wr2, wr12, wr13       \n\t"
 931         "pld [%[pixels]]                \n\t"
 932         "pld [%[pixels], #32]           \n\t"
 933         "wmoveq wr10, wr13              \n\t"
 934         "walignr2ne wr10, wr12, wr13    \n\t"
 935         "wunpckelub wr0, wr2            \n\t"
 936         "wunpckehub wr1, wr2            \n\t"
 937         "wunpckelub wr8, wr10           \n\t"
 938         "wunpckehub wr9, wr10           \n\t"
 939         "waddhus wr0, wr0, wr8          \n\t"
 940         "waddhus wr1, wr1, wr9          \n\t"
 941         "waddhus wr8, wr0, wr4          \n\t"
 942         "waddhus wr9, wr1, wr5          \n\t"
 943         "waddhus wr8, wr8, wr15         \n\t"
 944         "waddhus wr9, wr9, wr15         \n\t"
 945         "wldrd wr12, [%[block]]         \n\t"
 946         "wsrlhg wr8, wr8, wcgr0         \n\t"
 947         "wsrlhg wr9, wr9, wcgr0         \n\t"
 948         "wpackhus wr8, wr8, wr9         \n\t"
 949         "subs %[h], %[h], #2            \n\t" /* two rows done; Z set when h hits 0 */
 950         WAVG2B" wr8, wr8, wr12          \n\t"
 951         "wstrd wr8, [%[block]]          \n\t"
 952         "add %[block], %[block], %[line_size]   \n\t"
 953         "pld [%[block]]                 \n\t"
 954         "pld [%[block], #32]            \n\t"
 955         "bne 1b                         \n\t"
 956         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 957         : [line_size]"r"(line_size)
         /* cmp and subs overwrite the condition flags, hence "cc" */
 958         : "r12", "cc", "memory");
 959 }
 960
 961 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 962 {
         /*
          * 16-pixel-wide 2x2 interpolation merged into the destination: for
          * each row the value
          *     (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2
          * is computed from two vertically adjacent source rows p and q, packed
          * to bytes, combined with the 16 bytes already stored at block through
          * WAVG2B, and written back.
          *
          * block:     destination; read and written 16 bytes per row, advanced
          *            by line_size after each row.
          * pixels:    source; rounded down to a multiple of 8, 24 bytes are
          *            loaded per row and realigned with walignr1/walignr2.
          * line_size: byte distance between rows, used for both pointers.
          * h:         row count; two rows are produced per loop pass and the
          *            loop only exits when h reaches exactly 0, so h has to be
          *            a positive even number.
          *
          * h + 1 source rows are read in total.  SET_RND and WAVG2B are macros
          * defined outside this function; WAVG2B presumably expands to the
          * byte-average instruction matching the rounding mode -- TODO confirm.
          */
 963     // [wr0 wr1 wr2 wr3] for previous line
 964     // [wr4 wr5 wr6 wr7] for current line
 965     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 966     __asm__ volatile(
 967         "pld [%[block]]                 \n\t"
 968         "pld [%[block], #32]            \n\t"
 969         "pld [%[pixels]]                \n\t"
 970         "mov r12, #2                    \n\t"
 971         "pld [%[pixels], #32]           \n\t"
 972         "tmcr wcgr0, r12                \n\t" /* for shift value */
 973         /* alignment */
 974         "and r12, %[pixels], #7         \n\t"
 975         "bic %[pixels], %[pixels], #7           \n\t"
 976         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset of pixel x */
 977         "add r12, r12, #1               \n\t"
 978         "tmcr wcgr2, r12                \n\t" /* wcgr2 = byte offset of pixel x+1 */
 979
 980         // [wr0 wr1 wr2 wr3] <= *
 981         // [wr4 wr5 wr6 wr7]
 982         "wldrd wr12, [%[pixels]]        \n\t"
 983         "cmp r12, #8                    \n\t" /* eq: offset+1 == 8, handled by wmoveq below */
 984         "wldrd wr13, [%[pixels], #8]    \n\t"
 985         "wldrd wr14, [%[pixels], #16]   \n\t"
 986         "add %[pixels], %[pixels], %[line_size] \n\t"
 987         "pld [%[pixels]]                \n\t"
 988         "walignr1 wr2, wr12, wr13       \n\t"
 989         "pld [%[pixels], #32]           \n\t"
 990         "walignr1 wr3, wr13, wr14       \n\t"
         /* offset 8 takes the following doubleword unchanged; presumably an
          * alignment of 8 cannot be expressed through wcgr2 -- TODO confirm */
 991         "wmoveq wr10, wr13              \n\t"
 992         "wmoveq wr11, wr14              \n\t"
 993         "walignr2ne wr10, wr12, wr13    \n\t"
 994         "walignr2ne wr11, wr13, wr14    \n\t"
 995         "wunpckelub wr0, wr2            \n\t"
 996         "wunpckehub wr1, wr2            \n\t"
 997         "wunpckelub wr2, wr3            \n\t"
 998         "wunpckehub wr3, wr3            \n\t"
 999         "wunpckelub wr8, wr10           \n\t"
1000         "wunpckehub wr9, wr10           \n\t"
1001         "wunpckelub wr10, wr11          \n\t"
1002         "wunpckehub wr11, wr11          \n\t"
1003         "waddhus wr0, wr0, wr8          \n\t" /* wr0..wr3 = p[x] + p[x+1] as halfwords */
1004         "waddhus wr1, wr1, wr9          \n\t"
1005         "waddhus wr2, wr2, wr10         \n\t"
1006         "waddhus wr3, wr3, wr11         \n\t"
1007
1008         "1:                             \n\t"
1009         // [wr0 wr1 wr2 wr3]
1010         // [wr4 wr5 wr6 wr7] <= *
1011         "wldrd wr12, [%[pixels]]        \n\t"
1012         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
1013         "wldrd wr13, [%[pixels], #8]    \n\t"
1014         "wldrd wr14, [%[pixels], #16]   \n\t"
1015         "add %[pixels], %[pixels], %[line_size] \n\t"
1016         "walignr1 wr6, wr12, wr13       \n\t"
1017         "pld [%[pixels]]                \n\t"
1018         "pld [%[pixels], #32]           \n\t"
1019         "walignr1 wr7, wr13, wr14       \n\t"
1020         "wmoveq wr10, wr13              \n\t"
1021         "wmoveq wr11, wr14              \n\t"
1022         "walignr2ne wr10, wr12, wr13    \n\t"
1023         "walignr2ne wr11, wr13, wr14    \n\t"
1024         "wunpckelub wr4, wr6            \n\t"
1025         "wunpckehub wr5, wr6            \n\t"
1026         "wunpckelub wr6, wr7            \n\t"
1027         "wunpckehub wr7, wr7            \n\t"
1028         "wunpckelub wr8, wr10           \n\t"
1029         "wunpckehub wr9, wr10           \n\t"
1030         "wunpckelub wr10, wr11          \n\t"
1031         "wunpckehub wr11, wr11          \n\t"
1032         "waddhus wr4, wr4, wr8          \n\t" /* wr4..wr7 = q[x] + q[x+1] as halfwords */
1033         "waddhus wr5, wr5, wr9          \n\t"
1034         "waddhus wr6, wr6, wr10         \n\t"
1035         "waddhus wr7, wr7, wr11         \n\t"
1036         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
1037         "waddhus wr9, wr1, wr5          \n\t"
1038         "waddhus wr10, wr2, wr6         \n\t"
1039         "waddhus wr11, wr3, wr7         \n\t"
1040         "waddhus wr8, wr8, wr15         \n\t" /* + rounding constant */
1041         "waddhus wr9, wr9, wr15         \n\t"
1042         "waddhus wr10, wr10, wr15       \n\t"
1043         "waddhus wr11, wr11, wr15       \n\t"
1044         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
1045         "wsrlhg wr9, wr9, wcgr0         \n\t"
1046         "wldrd wr12, [%[block]]         \n\t" /* current destination bytes 0..7 */
1047         "wldrd wr13, [%[block], #8]     \n\t" /* current destination bytes 8..15 */
1048         "wsrlhg wr10, wr10, wcgr0       \n\t"
1049         "wsrlhg wr11, wr11, wcgr0       \n\t"
1050         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to 8 + 8 bytes */
1051         "wpackhus wr9, wr10, wr11       \n\t"
1052         WAVG2B" wr8, wr8, wr12          \n\t"
1053         WAVG2B" wr9, wr9, wr13          \n\t"
1054         "wstrd wr8, [%[block]]          \n\t"
1055         "wstrd wr9, [%[block], #8]      \n\t"
1056         "add %[block], %[block], %[line_size]   \n\t"
1057
1058         // [wr0 wr1 wr2 wr3] <= *
1059         // [wr4 wr5 wr6 wr7]
         /* second row of the pass: the row just summed in wr4..wr7 now acts as
          * the previous row; the eq/ne flags of the cmp above are reused, no
          * instruction in between is a flag-setting ARM instruction */
1060         "wldrd wr12, [%[pixels]]        \n\t"
1061         "pld [%[block]]                 \n\t"
1062         "wldrd wr13, [%[pixels], #8]    \n\t"
1063         "pld [%[block], #32]            \n\t"
1064         "wldrd wr14, [%[pixels], #16]   \n\t"
1065         "add %[pixels], %[pixels], %[line_size] \n\t"
1066         "walignr1 wr2, wr12, wr13       \n\t"
1067         "pld [%[pixels]]                \n\t"
1068         "pld [%[pixels], #32]           \n\t"
1069         "walignr1 wr3, wr13, wr14       \n\t"
1070         "wmoveq wr10, wr13              \n\t"
1071         "wmoveq wr11, wr14              \n\t"
1072         "walignr2ne wr10, wr12, wr13    \n\t"
1073         "walignr2ne wr11, wr13, wr14    \n\t"
1074         "wunpckelub wr0, wr2            \n\t"
1075         "wunpckehub wr1, wr2            \n\t"
1076         "wunpckelub wr2, wr3            \n\t"
1077         "wunpckehub wr3, wr3            \n\t"
1078         "wunpckelub wr8, wr10           \n\t"
1079         "wunpckehub wr9, wr10           \n\t"
1080         "wunpckelub wr10, wr11          \n\t"
1081         "wunpckehub wr11, wr11          \n\t"
1082         "waddhus wr0, wr0, wr8          \n\t"
1083         "waddhus wr1, wr1, wr9          \n\t"
1084         "waddhus wr2, wr2, wr10         \n\t"
1085         "waddhus wr3, wr3, wr11         \n\t"
1086         "waddhus wr8, wr0, wr4          \n\t"
1087         "waddhus wr9, wr1, wr5          \n\t"
1088         "waddhus wr10, wr2, wr6         \n\t"
1089         "waddhus wr11, wr3, wr7         \n\t"
1090         "waddhus wr8, wr8, wr15         \n\t"
1091         "waddhus wr9, wr9, wr15         \n\t"
1092         "waddhus wr10, wr10, wr15       \n\t"
1093         "waddhus wr11, wr11, wr15       \n\t"
1094         "wsrlhg wr8, wr8, wcgr0         \n\t"
1095         "wsrlhg wr9, wr9, wcgr0         \n\t"
1096         "wldrd wr12, [%[block]]         \n\t"
1097         "wldrd wr13, [%[block], #8]     \n\t"
1098         "wsrlhg wr10, wr10, wcgr0       \n\t"
1099         "wsrlhg wr11, wr11, wcgr0       \n\t"
1100         "wpackhus wr8, wr8, wr9         \n\t"
1101         "wpackhus wr9, wr10, wr11       \n\t"
1102         WAVG2B" wr8, wr8, wr12          \n\t"
1103         WAVG2B" wr9, wr9, wr13          \n\t"
1104         "wstrd wr8, [%[block]]          \n\t"
1105         "wstrd wr9, [%[block], #8]      \n\t"
1106         "add %[block], %[block], %[line_size]   \n\t"
1107         "subs %[h], %[h], #2            \n\t" /* two rows done; Z set when h hits 0 */
1108         "pld [%[block]]                 \n\t"
1109         "pld [%[block], #32]            \n\t"
1110         "bne 1b                         \n\t"
1111         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1112         : [line_size]"r"(line_size)
         /* cmp and subs overwrite the condition flags, hence "cc" */
1113         : "r12", "cc", "memory");
1114 }