]> git.sesse.net Git - x264/blob - common/aarch64/dct-a.S
aarch64: NEON asm for missing x264_zigzag_* functions
[x264] / common / aarch64 / dct-a.S
1 /****************************************************************************
2  * dct-a.S: aarch64 transform and zigzag
3  *****************************************************************************
4  * Copyright (C) 2009-2014 x264 project
5  *
6  * Authors: David Conrad <lessen42@gmail.com>
7  *          Janne Grunau <janne-x264@jannau.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26
27 #include "asm.S"
28
// tbl control bytes used by x264_zigzag_scan_4x4_frame_neon: each byte pair
// (2k, 2k+1) selects the two bytes of 16-bit element k from the 32-byte input.
29 const scan4x4_frame, align=4
30 .byte    0,1,   8,9,   2,3,   4,5
31 .byte   10,11, 16,17, 24,25, 18,19
32 .byte   12,13,  6,7,  14,15, 20,21
33 .byte   26,27, 28,29, 22,23, 30,31
34 endconst
35
// tbl control bytes used by x264_zigzag_scan_4x4_field_neon. Only 16 bytes:
// they permute the first eight 16-bit elements; that function stores the
// second eight elements unchanged.
36 const scan4x4_field, align=4
37 .byte    0,1,   2,3,   8,9,   4,5
38 .byte    6,7,  10,11, 12,13, 14,15
39 endconst
40
// Byte permutation for the "frame" instances of the zigzag_sub_4x4 macro
// (referenced there as sub4x4_\f). Index r*4+c addresses byte c of the r-th
// 4-byte row loaded into the 16-byte pixel vector.
41 const sub4x4_frame, align=4
42 .byte    0,  1,  4,  8
43 .byte    5,  2,  3,  6
44 .byte    9, 12, 13, 10
45 .byte    7, 11, 14, 15
46 endconst
47
// Byte permutation for the "field" instances of the zigzag_sub_4x4 macro
// (referenced there as sub4x4_\f). Index r*4+c addresses byte c of the r-th
// 4-byte row loaded into the 16-byte pixel vector.
48 const sub4x4_field, align=4
49 .byte    0,  4,  1,  8
50 .byte   12,  5,  9, 13
51 .byte    2,  6, 10, 14
52 .byte    3,  7, 11, 15
53 endconst
54
55 // sum = a + (b>>shift)   sub = (a>>shift) - b
// The shifts are arithmetic (sshr). \t0 and \t1 are scratch and are overwritten.
// \sum is written before \sub is computed, so \sum must not be the same
// register as \b or \t1. \sub is written last and may therefore reuse \a.
56 .macro SUMSUB_SHR shift sum sub a b t0 t1
57     sshr        \t0,  \b, #\shift
58     sshr        \t1,  \a, #\shift
59     add         \sum, \a, \t0
60     sub         \sub, \t1, \b
61 .endm
62
63 // sum = (a>>shift) + b   sub = a - (b>>shift)
// The shifts are arithmetic (sshr). \t0 and \t1 are scratch and are overwritten.
// \sum is written before \sub is computed, so \sum must not be the same
// register as \a or \t1.
64 .macro SUMSUB_SHR2 shift sum sub a b t0 t1
65     sshr        \t0,  \a, #\shift
66     sshr        \t1,  \b, #\shift
67     add         \sum, \t0, \b
68     sub         \sub, \a, \t1
69 .endm
70
71 // a += 1.5*ma   b -= 1.5*mb
// "1.5*x" is evaluated as x + (x>>1) with an arithmetic shift.
// \t0 and \t1 are scratch and are overwritten; \ma and \mb are only read.
72 .macro SUMSUB_15 a b ma mb t0 t1
73     sshr        \t0, \ma, #1
74     sshr        \t1, \mb, #1
75     add         \t0, \t0, \ma
76     add         \t1, \t1, \mb
77     add         \a,  \a,  \t0
78     sub         \b,  \b,  \t1
79 .endm
80
81
// x264_dct4x4dc_neon: in-place transform of the 16 halfwords at [x0].
//   In:    x0 = pointer to four consecutive 4h vectors (32 bytes), read and rewritten
//   Clobb: v0-v7, v16, v17, v31
// SUMSUB_AB and transpose are macros defined outside this section (asm.S is
// included above); presumably SUMSUB_AB sum, diff, a, b yields a+b and a-b.
// The final stage halves every result with rounding, i.e. (x + 1) >> 1.
82 function x264_dct4x4dc_neon, export=1
83     ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
84     movi        v31.4h, #1                       // rounding constant for the shsub results below
85     SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
86     SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
87     SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
88     SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
89     transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
90     transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
91     SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
92     SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
93     transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
94     transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
95     add         v16.4h, v4.4h,  v31.4h           // pre-add 1 so the halving subtract rounds
96     add         v17.4h, v6.4h,  v31.4h
97     srhadd      v0.4h,  v4.4h,  v5.4h            // (v4 + v5 + 1) >> 1
98     shsub       v1.4h,  v16.4h, v5.4h            // (v4 + 1 - v5) >> 1
99     shsub       v2.4h,  v17.4h, v7.4h            // (v6 + 1 - v7) >> 1
100     srhadd      v3.4h,  v6.4h,  v7.4h            // (v6 + v7 + 1) >> 1
101     st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
102     ret
103 endfunc
104
// x264_idct4x4dc_neon: in-place transform of the 16 halfwords at [x0].
//   In:    x0 = pointer to four consecutive 4h vectors (32 bytes), read and rewritten
//   Clobb: v0-v7
// Same butterfly/transpose sequence as x264_dct4x4dc_neon, but the last stage
// is a plain SUMSUB_AB with no halving or rounding.
105 function x264_idct4x4dc_neon, export=1
106     ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
107     SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
108     SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
109     SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
110     SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
111     transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
112     transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
113     SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
114     SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
115     transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
116     transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
117     SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h
118     SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
119     st1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
120     ret
121 endfunc
122
// One 1-D pass of the 4-point forward transform.
//   Outputs: \v0..\v3      Inputs: \v4..\v7 (all four are overwritten)
// Assuming SUMSUB_AB sum, diff, a, b produces a+b / a-b (macro defined
// elsewhere - confirm), with in0..in3 = \v4..\v7 on entry:
//   \v0 = (in0+in3) + (in1+in2)        \v2 = (in0+in3) - (in1+in2)
//   \v1 = 2*(in0-in3) + (in1-in2)      \v3 = (in0-in3) - 2*(in1-in2)
// \v1 and \v3 first hold the sums and are then reused for the final results,
// so the instruction order matters.
123 .macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
124     SUMSUB_AB   \v1, \v6, \v5, \v6
125     SUMSUB_AB   \v3, \v7, \v4, \v7
126     add         \v0, \v3, \v1
127     add         \v4, \v7, \v7
128     add         \v5, \v6, \v6
129     sub         \v2, \v3, \v1
130     add         \v1, \v4, \v6
131     sub         \v3, \v7, \v5
132 .endm
133
// x264_sub4x4_dct_neon: 4x4 transform of the difference between two pixel blocks.
//   In:    x0 = output, 16 halfwords (32 bytes)
//          x1 = first pixel block,  row stride FENC_STRIDE
//          x2 = second pixel block, row stride FDEC_STRIDE
//   Out:   [x0] = transformed (x1 - x2) differences; x1 and x2 end 4 rows further on
//   Clobb: x3, x4, v0-v7, v16-v19
// Each row load fills only the low 4 bytes of its register; only the low four
// halfwords of each usubl result are consumed (DCT_1D is invoked on .4h).
134 function x264_sub4x4_dct_neon, export=1
135     mov         x3, #FENC_STRIDE
136     mov         x4, #FDEC_STRIDE
137     ld1        {v0.s}[0], [x1], x3
138     ld1        {v1.s}[0], [x2], x4
139     ld1        {v2.s}[0], [x1], x3
140     usubl       v16.8h, v0.8b,  v1.8b           // row 0 difference, widened to 16 bit
141     ld1        {v3.s}[0], [x2], x4
142     ld1        {v4.s}[0], [x1], x3
143     usubl       v17.8h, v2.8b,  v3.8b           // row 1
144     ld1        {v5.s}[0], [x2], x4
145     ld1        {v6.s}[0], [x1], x3
146     usubl       v18.8h, v4.8b,  v5.8b           // row 2
147     ld1        {v7.s}[0], [x2], x4
148     usubl       v19.8h, v6.8b,  v7.8b           // row 3
149
150     DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
151     transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
152     DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
153     st1        {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
154     ret
155 endfunc
156
// x264_sub8x4_dct_neon: internal helper (not exported). Transforms the
// difference of an 8-wide, 4-row pixel strip and writes 32 halfwords.
//   In:    x0 = output pointer
//          x1 = first pixel block,  x3 = its row stride (must be set by the caller)
//          x2 = second pixel block, x4 = its row stride (must be set by the caller)
//   Out:   x0 advanced by 64 bytes; x1 and x2 advanced by 4 rows
//   Clobb: v0-v7, v16-v19, v21, v22
// The callers in this file rely on the pointer advances to chain several calls.
157 function x264_sub8x4_dct_neon
158     ld1        {v0.8b}, [x1], x3
159     ld1        {v1.8b}, [x2], x4
160     usubl       v16.8h, v0.8b,  v1.8b
161     ld1        {v2.8b}, [x1], x3
162     ld1        {v3.8b}, [x2], x4
163     usubl       v17.8h, v2.8b,  v3.8b
164     ld1        {v4.8b}, [x1], x3
165     ld1        {v5.8b}, [x2], x4
166     usubl       v18.8h, v4.8b,  v5.8b
167     ld1        {v6.8b}, [x1], x3
168     ld1        {v7.8b}, [x2], x4
169     usubl       v19.8h, v6.8b,  v7.8b
170
171     DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
172     transpose4x8.h  v0, v1, v2, v3, v4, v5, v6, v7
173
// Second 1-D pass, written out inline instead of through DCT_1D.
174     SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
175     SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
176     add         v22.8h, v19.8h, v19.8h
177     add         v21.8h, v18.8h, v18.8h
178     add         v0.8h,  v16.8h, v17.8h
179     sub         v1.8h,  v16.8h, v17.8h
180
181     add         v2.8h,  v22.8h, v18.8h
182     sub         v3.8h,  v19.8h, v21.8h
183
// Regroup by 64-bit halves: v4/v5 collect the low halves of v0,v2 / v1,v3 and
// v6/v7 the high halves, so the low-half data is stored first (32 bytes),
// followed by the high-half data (32 bytes).
184     zip1        v4.2d,  v0.2d,  v2.2d
185     zip2        v6.2d,  v0.2d,  v2.2d
186     zip1        v5.2d,  v1.2d,  v3.2d
187     zip2        v7.2d,  v1.2d,  v3.2d
188
189     st1        {v4.8h}, [x0], #16
190     st1        {v5.8h}, [x0], #16
191     st1        {v6.8h}, [x0], #16
192     st1        {v7.8h}, [x0], #16
193     ret
194 endfunc
195
// x264_sub8x8_dct_neon: 8x8 block handled as two stacked 8x4 strips.
//   In:    x0 = output, x1 / x2 = the two pixel blocks
//   Clobb: x3, x4, x5 plus everything x264_sub8x4_dct_neon clobbers
// The helper advances x0, x1 and x2 itself, so the second strip needs no
// pointer fix-up.
196 function x264_sub8x8_dct_neon, export=1
197     mov         x3, #FENC_STRIDE                // row stride used for x1
198     mov         x4, #FDEC_STRIDE                // row stride used for x2
199     mov         x5,  x30                        // keep the return address across the bl
200     bl          x264_sub8x4_dct_neon            // upper four rows
201     mov         x30, x5                         // restore LR so the helper's ret returns to our caller
202     b           x264_sub8x4_dct_neon            // lower four rows (tail call)
203 endfunc
204
// x264_sub16x16_dct_neon: 16x16 block processed as four 8x8 quadrants, each
// quadrant being two consecutive calls of x264_sub8x4_dct_neon.
//   In:    x0 = output, x1 / x2 = the two pixel blocks
//   Clobb: x3, x4, x5 plus everything x264_sub8x4_dct_neon clobbers
// Quadrant order follows from the pointer arithmetic below: top-left,
// top-right, bottom-left, bottom-right. x0 is only ever advanced by the helper.
205 function x264_sub16x16_dct_neon, export=1
206     mov         x5,  x30                        // save LR; the bl calls below overwrite it
207     mov         x3, #FENC_STRIDE
208     mov         x4, #FDEC_STRIDE
209     bl          x264_sub8x4_dct_neon
210     bl          x264_sub8x4_dct_neon
211     sub         x1, x1, #8*FENC_STRIDE-8        // back up 8 rows, step right 8 pixels
212     sub         x2, x2, #8*FDEC_STRIDE-8
213     bl          x264_sub8x4_dct_neon
214     bl          x264_sub8x4_dct_neon
215     sub         x1, x1, #8                      // already 8 rows down: step left 8 pixels
216     sub         x2, x2, #8
217     bl          x264_sub8x4_dct_neon
218     bl          x264_sub8x4_dct_neon
219     sub         x1, x1, #8*FENC_STRIDE-8        // back up 8 rows, step right 8 pixels
220     sub         x2, x2, #8*FDEC_STRIDE-8
221     bl          x264_sub8x4_dct_neon
222     mov         x30, x5                         // restore LR for the tail call
223     b           x264_sub8x4_dct_neon
224 endfunc
225
226
// One 1-D pass of the 8-point forward transform.
//   In/Out: v0-v7 (8h each), transformed in place
//   Clobb:  v16-v31
// The \type argument is accepted (callers pass "row" / "col") but is not
// referenced anywhere in the body.
227 .macro DCT8_1D type
228     SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34
229     SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
230     SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
231     SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07
232
233     SUMSUB_AB   v24.8h, v26.8h,  v23.8h, v18.8h  // a0/a2
234     SUMSUB_AB   v25.8h, v27.8h,  v22.8h, v19.8h  // a1/a3
235
236     SUMSUB_AB   v30.8h, v29.8h,  v20.8h, v17.8h  // a6/a5
// v23 = d16 + (d16>>1), v18 = d25 + (d25>>1); subtracted from a6 / a5
237     sshr        v23.8h, v21.8h, #1
238     sshr        v18.8h, v16.8h, #1
239     add         v23.8h, v23.8h, v21.8h
240     add         v18.8h, v18.8h, v16.8h
241     sub         v30.8h, v30.8h, v23.8h
242     sub         v29.8h, v29.8h, v18.8h
243
244     SUMSUB_AB   v28.8h, v31.8h,  v21.8h, v16.8h   // a4/a7
// v22 = d07 + (d07>>1), v19 = d34 + (d34>>1); added to a4 (result in v22) / a7
245     sshr        v22.8h, v20.8h, #1
246     sshr        v19.8h, v17.8h, #1
247     add         v22.8h, v22.8h, v20.8h
248     add         v19.8h, v19.8h, v17.8h
249     add         v22.8h, v28.8h, v22.8h
250     add         v31.8h, v31.8h, v19.8h
251
// Final combination into the output registers v0-v7.
252     SUMSUB_AB      v0.8h,  v4.8h,  v24.8h, v25.8h
253     SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
254     SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
255     SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
256 .endm
257
// x264_sub8x8_dct8_neon: 8x8 transform of the difference between two 8x8 pixel blocks.
//   In:    x0 = output, 64 halfwords (128 bytes)
//          x1 = first pixel block,  row stride FENC_STRIDE
//          x2 = second pixel block, row stride FDEC_STRIDE
//   Out:   x0 advanced by 128 bytes; x1 and x2 advanced by 8 rows
//          (x264_sub16x16_dct8_neon relies on these advances)
//   Clobb: x3, x4, v0-v7, v16-v31
258 function x264_sub8x8_dct8_neon, export=1
259     mov         x3, #FENC_STRIDE
260     mov         x4, #FDEC_STRIDE
// Load 8 row pairs, widening each row difference to 16 bit into v0-v7;
// loads are interleaved with the usubl of the previous pair.
261     ld1        {v16.8b}, [x1], x3
262     ld1        {v17.8b}, [x2], x4
263     ld1        {v18.8b}, [x1], x3
264     ld1        {v19.8b}, [x2], x4
265     usubl       v0.8h,  v16.8b, v17.8b
266     ld1        {v20.8b}, [x1], x3
267     ld1        {v21.8b}, [x2], x4
268     usubl       v1.8h,  v18.8b, v19.8b
269     ld1        {v22.8b}, [x1], x3
270     ld1        {v23.8b}, [x2], x4
271     usubl       v2.8h,  v20.8b, v21.8b
272     ld1        {v24.8b}, [x1], x3
273     ld1        {v25.8b}, [x2], x4
274     usubl       v3.8h,  v22.8b, v23.8b
275     ld1        {v26.8b}, [x1], x3
276     ld1        {v27.8b}, [x2], x4
277     usubl       v4.8h,  v24.8b, v25.8b
278     ld1        {v28.8b}, [x1], x3
279     ld1        {v29.8b}, [x2], x4
280     usubl       v5.8h,  v26.8b, v27.8b
281     ld1        {v30.8b}, [x1], x3
282     ld1        {v31.8b}, [x2], x4
283     usubl       v6.8h,  v28.8b, v29.8b
284     usubl       v7.8h,  v30.8b, v31.8b
285
286     DCT8_1D row
287     transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
288     DCT8_1D col
289
290     st1        {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
291     st1        {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
292     ret
293 endfunc
294
// x264_sub16x16_dct8_neon: four calls of x264_sub8x8_dct8_neon covering a
// 16x16 area in the order top-left, top-right, bottom-left, bottom-right.
//   In:    x0 = output, x1 / x2 = the two pixel blocks
//   Clobb: x7 plus everything x264_sub8x8_dct8_neon clobbers
// Each call leaves x1/x2 eight rows further down and x0 past the written
// coefficients, so only x1/x2 need adjusting between quadrants.
295 function x264_sub16x16_dct8_neon, export=1
296     mov         x7,  x30                        // save LR across the bl calls
297     bl          X(x264_sub8x8_dct8_neon)
298     sub         x2,  x2,  #FDEC_STRIDE*8 - 8    // up 8 rows, right 8 pixels
299     sub         x1,  x1,  #FENC_STRIDE*8 - 8
300     bl          X(x264_sub8x8_dct8_neon)
301     sub         x2,  x2,  #8                    // left 8 pixels (already 8 rows down)
302     sub         x1,  x1,  #8
303     bl          X(x264_sub8x8_dct8_neon)
304     sub         x2,  x2,  #FDEC_STRIDE*8 - 8    // up 8 rows, right 8 pixels
305     sub         x1,  x1,  #FENC_STRIDE*8 - 8
306     mov         x30, x7                         // restore LR for the tail call
307     b           X(x264_sub8x8_dct8_neon)
308 endfunc
309
310
311 // First part of IDCT (minus final SUMSUB_BA)
// Inputs \d0..\d3 are only read; results go to \d4..\d7:
//   \d4, \d5 = SUMSUB_AB of \d0 and \d2 (presumably \d0+\d2 and \d0-\d2)
//   \d6 = (\d3 >> 1) + \d1
//   \d7 = (\d1 >> 1) - \d3
// Callers in this file finish the pass with two SUMSUB_AB invocations.
312 .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
313     SUMSUB_AB   \d4, \d5, \d0, \d2
314     sshr        \d7, \d1, #1
315     sshr        \d6, \d3, #1
316     sub         \d7, \d7, \d3
317     add         \d6, \d6, \d1
318 .endm
319
// x264_add4x4_idct_neon: inverse 4x4 transform, result added to the pixels at x0.
//   In:    x0 = 4x4 pixel block, row stride FDEC_STRIDE (read and rewritten)
//          x1 = 16 halfword coefficients (32 bytes)
//   Out:   x0 ends 4 rows past the block
//   Clobb: x2, v0-v7, v16-v19, v28-v31
// After each pass the third and fourth rows sit in swapped registers (v3 holds
// the third row, v2 the fourth - assuming SUMSUB_AB yields a+b / a-b). That is
// why v3/v2 are passed in that order to the transpose and to the second
// IDCT_1D, why the third pixel row is loaded into v31 and the fourth into v30,
// and why the final stores go v0, v1, v3, v2.
320 function x264_add4x4_idct_neon, export=1
321     mov         x2, #FDEC_STRIDE
322     ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
323
324     IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
325     ld1        {v28.s}[0], [x0], x2             // pixel row 0 (loads are interleaved with the math)
326     SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
327     SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
328
329     transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
330
331     IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
332     ld1        {v29.s}[0], [x0], x2             // pixel row 1
333     SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
334     SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
335
336     srshr       v0.4h,  v0.4h,  #6              // rounding shift: (x + 32) >> 6
337     srshr       v1.4h,  v1.4h,  #6
338     ld1        {v31.s}[0], [x0], x2             // pixel row 2
339     srshr       v2.4h,  v2.4h,  #6
340     srshr       v3.4h,  v3.4h,  #6
341     ld1        {v30.s}[0], [x0], x2             // pixel row 3
342
343     sub         x0,  x0,  x2,  lsl #2           // rewind x0 to row 0 for the stores
344     uaddw       v0.8h,  v0.8h,  v28.8b          // residual + zero-extended pixels
345     uaddw       v1.8h,  v1.8h,  v29.8b
346     uaddw       v2.8h,  v2.8h,  v30.8b          // v2 pairs with pixel row 3
347     uaddw       v3.8h,  v3.8h,  v31.8b          // v3 pairs with pixel row 2
348     sqxtun      v0.8b,  v0.8h                   // saturate to 0..255
349     sqxtun      v1.8b,  v1.8h
350     sqxtun      v2.8b,  v2.8h
351     sqxtun      v3.8b,  v3.8h
352
353     st1        {v0.s}[0], [x0], x2
354     st1        {v1.s}[0], [x0], x2
355     st1        {v3.s}[0], [x0], x2
356     st1        {v2.s}[0], [x0], x2
357     ret
358 endfunc
359
// x264_add8x4_idct_neon: inverse transform of 32 halfword coefficients, added
// to an 8-wide, 4-row pixel strip.
//   In:    x0 = pixel strip (read and rewritten)
//          x1 = coefficients (64 bytes)
//          x2 = pixel row stride - NOT set here; the callers in this file load
//               FDEC_STRIDE into x2 before calling
//   Out:   x0 advanced by 4 rows; x1 advanced by 64 bytes
//   Clobb: v0-v7, v16-v23, v28-v31
360 function x264_add8x4_idct_neon, export=1
361     ld1        {v0.8h,v1.8h}, [x1], #32
362     ld1        {v2.8h,v3.8h}, [x1], #32
// Regroup the 64-bit halves of the two 32-byte coefficient groups so each
// register feeds one IDCT_1D input (exact lane layout depends on the
// transpose macro, which is defined elsewhere).
363     transpose   v20.2d, v21.2d, v0.2d, v2.2d
364     transpose   v22.2d, v23.2d, v1.2d, v3.2d
365     IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
366     SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
367     SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
368
369     transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
370
371     IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
372     SUMSUB_AB   v0.8h,  v3.8h,  v16.8h, v18.8h
373     SUMSUB_AB   v1.8h,  v2.8h,  v17.8h, v19.8h
374
// Rounding shift by 6, interleaved with the four pixel-row loads.
375     srshr       v0.8h,  v0.8h,  #6
376     ld1        {v28.8b}, [x0], x2
377     srshr       v1.8h,  v1.8h,  #6
378     ld1        {v29.8b}, [x0], x2
379     srshr       v2.8h,  v2.8h,  #6
380     ld1        {v30.8b}, [x0], x2
381     srshr       v3.8h,  v3.8h,  #6
382     ld1        {v31.8b}, [x0], x2
383
384     sub         x0,  x0,  x2,  lsl #2           // rewind x0 to the first row
385     uaddw       v0.8h,  v0.8h,  v28.8b          // residual + zero-extended pixels
386     uaddw       v1.8h,  v1.8h,  v29.8b
387     uaddw       v2.8h,  v2.8h,  v30.8b
388     uaddw       v3.8h,  v3.8h,  v31.8b
389
390     sqxtun      v0.8b,  v0.8h                   // saturate to 0..255
391     sqxtun      v1.8b,  v1.8h
392     st1        {v0.8b}, [x0], x2
393     sqxtun      v2.8b,  v2.8h
394     st1        {v1.8b}, [x0], x2
395     sqxtun      v3.8b,  v3.8h
396     st1        {v2.8b}, [x0], x2
397     st1        {v3.8b}, [x0], x2
398     ret
399 endfunc
400
// x264_add8x8_idct_neon: 8x8 area handled as two stacked 8x4 strips.
//   In:    x0 = pixel block, x1 = coefficients
//   Clobb: x2, x5 plus everything x264_add8x4_idct_neon clobbers
// The helper advances x0 by four rows and x1 by 64 bytes, so the second call
// continues directly below the first.
401 function x264_add8x8_idct_neon, export=1
402     mov             x5,  x30                    // keep the return address across the bl
403     mov             x2, #FDEC_STRIDE            // pixel row stride expected by the helper
404     bl              X(x264_add8x4_idct_neon)    // upper four rows
405     mov             x30, x5                     // restore LR so the helper's ret returns to our caller
406     b               X(x264_add8x4_idct_neon)    // lower four rows (tail call)
407 endfunc
408
// x264_add16x16_idct_neon: 16x16 area processed as four 8x8 quadrants, each
// quadrant being two consecutive calls of x264_add8x4_idct_neon.
//   In:    x0 = pixel block, x1 = coefficients (consumed sequentially by the helper)
//   Clobb: x2, x5 plus everything x264_add8x4_idct_neon clobbers
// Quadrant order follows from the x0 arithmetic below: top-left, top-right,
// bottom-left, bottom-right.
409 function x264_add16x16_idct_neon, export=1
410     mov             x2, #FDEC_STRIDE
411     mov             x5,  x30                    // save LR; the bl calls below overwrite it
412     bl              X(x264_add8x4_idct_neon)
413     bl              X(x264_add8x4_idct_neon)
414     sub             x0, x0, #8*FDEC_STRIDE-8    // back up 8 rows, step right 8 pixels
415     bl              X(x264_add8x4_idct_neon)
416     bl              X(x264_add8x4_idct_neon)
417     sub             x0, x0, #8                  // already 8 rows down: step left 8 pixels
418     bl              X(x264_add8x4_idct_neon)
419     bl              X(x264_add8x4_idct_neon)
420     sub             x0, x0, #8*FDEC_STRIDE-8    // back up 8 rows, step right 8 pixels
421     bl              X(x264_add8x4_idct_neon)
422     mov             x30, x5                     // restore LR for the tail call
423     b               X(x264_add8x4_idct_neon)
424 endfunc
425
// One 1-D pass of the 8-point inverse transform.
//   In/Out: v16-v23 (8h each), transformed in place
//   Clobb:  v0-v3
// With \type == row the last two inputs (v22, v23) are loaded from [x1]
// (post-incremented by 32) inside the macro, after the first butterfly has
// been issued; with any other \type all eight inputs must already be in
// v16-v23.
// Several invocations below pass the same register as both input and output
// or reuse inputs as scratch, so the statement order must not be changed.
426 .macro IDCT8_1D type
427     SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h          // a0/a2
428 .ifc \type, row
429     ld1        {v22.8h,v23.8h}, [x1], #32
430 .endif
431     SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h   // a6/a4
432     SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
433     SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h      // a7/a1
434     SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
435     SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h      // a5/a3
436
437     SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h   // b3/b5
438     SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h   // b1/b7
439
440     SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h           // b0/b6
441     SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h           // b2/b4
442
// Final butterflies: results land in v16-v23.
443     SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
444     SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
445     SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
446     SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
447 .endm
448
// x264_add8x8_idct8_neon: inverse 8x8 transform, result added to the pixels at x0.
//   In:    x0 = 8x8 pixel block, row stride FDEC_STRIDE (read and rewritten)
//          x1 = 64 halfword coefficients (128 bytes)
//   Out:   x0 advanced by 8 rows; x1 advanced by 128 bytes
//          (x264_add16x16_idct8_neon relies on these advances)
//   Clobb: x2, v0-v7, v16-v23, v30, v31
449 function x264_add8x8_idct8_neon, export=1
450     mov         x2,  #FDEC_STRIDE
// Only the first six coefficient rows are loaded here; the "row" variant of
// IDCT8_1D loads the remaining two (v22, v23) itself.
451     ld1        {v16.8h,v17.8h}, [x1], #32
452     ld1        {v18.8h,v19.8h}, [x1], #32
453     ld1        {v20.8h,v21.8h}, [x1], #32
454
455     IDCT8_1D    row
456
457     transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
458
459     IDCT8_1D    col
460
// Pixel-row loads interleaved with the rounding shift (x + 32) >> 6.
461     ld1        {v0.8b}, [x0], x2
462     srshr       v16.8h, v16.8h, #6
463     ld1        {v1.8b}, [x0], x2
464     srshr       v17.8h, v17.8h, #6
465     ld1        {v2.8b}, [x0], x2
466     srshr       v18.8h, v18.8h, #6
467     ld1        {v3.8b}, [x0], x2
468     srshr       v19.8h, v19.8h, #6
469     ld1        {v4.8b}, [x0], x2
470     srshr       v20.8h, v20.8h, #6
471     ld1        {v5.8b}, [x0], x2
472     srshr       v21.8h, v21.8h, #6
473     ld1        {v6.8b}, [x0], x2
474     srshr       v22.8h, v22.8h, #6
475     ld1        {v7.8b}, [x0], x2
476     srshr       v23.8h, v23.8h, #6
477     sub         x0,  x0,  x2,  lsl #3           // rewind x0 by 8 rows for the stores
478
// Per row: add zero-extended pixels (uaddw), saturate to 0..255 (sqxtun), store.
479     uaddw       v16.8h, v16.8h, v0.8b
480     uaddw       v17.8h, v17.8h, v1.8b
481     uaddw       v18.8h, v18.8h, v2.8b
482     sqxtun      v0.8b,  v16.8h
483     sqxtun      v1.8b,  v17.8h
484     sqxtun      v2.8b,  v18.8h
485     uaddw       v19.8h, v19.8h, v3.8b
486     st1        {v0.8b}, [x0], x2
487     uaddw       v20.8h, v20.8h, v4.8b
488     st1        {v1.8b}, [x0], x2
489     uaddw       v21.8h, v21.8h, v5.8b
490     st1        {v2.8b}, [x0], x2
491     sqxtun      v3.8b,  v19.8h
492     sqxtun      v4.8b,  v20.8h
493     uaddw       v22.8h, v22.8h, v6.8b
494     uaddw       v23.8h, v23.8h, v7.8b
495     st1        {v3.8b}, [x0], x2
496     sqxtun      v5.8b,  v21.8h
497     st1        {v4.8b}, [x0], x2
498     sqxtun      v6.8b,  v22.8h
499     sqxtun      v7.8b,  v23.8h
500     st1        {v5.8b}, [x0], x2
501     st1        {v6.8b}, [x0], x2
502     st1        {v7.8b}, [x0], x2
503     ret
504 endfunc
505
// x264_add16x16_idct8_neon: four calls of x264_add8x8_idct8_neon covering a
// 16x16 area in the order top-left, top-right, bottom-left, bottom-right.
//   In:    x0 = pixel block, x1 = coefficients (consumed sequentially by the callee)
//   Clobb: x7 plus everything x264_add8x8_idct8_neon clobbers
// Each call leaves x0 eight rows further down, so only x0 needs adjusting
// between quadrants.
506 function x264_add16x16_idct8_neon, export=1
507     mov             x7,  x30                    // save LR across the bl calls
508     bl              X(x264_add8x8_idct8_neon)
509     sub             x0,  x0,  #8*FDEC_STRIDE-8  // up 8 rows, right 8 pixels
510     bl              X(x264_add8x8_idct8_neon)
511     sub             x0,  x0,  #8                // left 8 pixels (already 8 rows down)
512     bl              X(x264_add8x8_idct8_neon)
513     mov             x30, x7                     // restore LR for the tail call
514     sub             x0,  x0,  #8*FDEC_STRIDE-8  // up 8 rows, right 8 pixels
515     b               X(x264_add8x8_idct8_neon)
516 endfunc
517
// x264_add8x8_idct_dc_neon: adds four rounded DC values to an 8x8 pixel block.
//   In:    x0 = 8x8 pixel block, row stride FDEC_STRIDE (read and rewritten)
//          x1 = four halfword DC values
//   Out:   x0 advanced by 8 rows
//   Clobb: x2, v0-v7, v16, v20-v23
// Each DC is rounded with (dc + 32) >> 6 and applied to one 4x4 quadrant:
// lanes 0/1 cover the left/right halves of rows 0-3, lanes 2/3 those of rows 4-7.
// The signed add is done on bytes as a saturating add of max(dc,0) followed by
// a saturating subtract of max(-dc,0).
518 function x264_add8x8_idct_dc_neon, export=1
519     mov         x2,  #FDEC_STRIDE
520     ld1        {v16.4h}, [x1]
521     ld1        {v0.8b}, [x0], x2
522     srshr       v16.4h, v16.4h, #6
523     ld1        {v1.8b}, [x0], x2
524     dup         v20.8h, v16.h[0]
525     dup         v21.8h, v16.h[1]
526     ld1        {v2.8b}, [x0], x2
527     dup         v22.8h, v16.h[2]
528     dup         v23.8h, v16.h[3]
529     ld1        {v3.8b}, [x0], x2
530     trn1        v20.2d, v20.2d,  v21.2d         // v20 = dc0 x4 | dc1 x4
531     ld1        {v4.8b}, [x0], x2
532     trn1        v21.2d, v22.2d,  v23.2d         // v21 = dc2 x4 | dc3 x4
533     ld1        {v5.8b}, [x0], x2
534     neg         v22.8h, v20.8h
535     ld1        {v6.8b}, [x0], x2
536     neg         v23.8h, v21.8h
537     ld1        {v7.8b}, [x0], x2
538
539     sub         x0,  x0,  #8*FDEC_STRIDE        // rewind x0 to the first row
540
541     sqxtun      v20.8b,  v20.8h                 // positive part, clamped to 0..255
542     sqxtun      v21.8b,  v21.8h
543     sqxtun      v22.8b,  v22.8h                 // negated (negative) part, clamped to 0..255
544     sqxtun      v23.8b,  v23.8h
545
546     uqadd       v0.8b,  v0.8b,  v20.8b
547     uqadd       v1.8b,  v1.8b,  v20.8b
548     uqadd       v2.8b,  v2.8b,  v20.8b
549     uqadd       v3.8b,  v3.8b,  v20.8b
550     uqadd       v4.8b,  v4.8b,  v21.8b
551     uqadd       v5.8b,  v5.8b,  v21.8b
552     uqadd       v6.8b,  v6.8b,  v21.8b
553     uqadd       v7.8b,  v7.8b,  v21.8b
554     uqsub       v0.8b,  v0.8b,  v22.8b
555     uqsub       v1.8b,  v1.8b,  v22.8b
556     uqsub       v2.8b,  v2.8b,  v22.8b
557     uqsub       v3.8b,  v3.8b,  v22.8b
558     uqsub       v4.8b,  v4.8b,  v23.8b
559     uqsub       v5.8b,  v5.8b,  v23.8b
560     uqsub       v6.8b,  v6.8b,  v23.8b
561     uqsub       v7.8b,  v7.8b,  v23.8b
562
563     st1        {v0.8b}, [x0], x2
564     st1        {v1.8b}, [x0], x2
565     st1        {v2.8b}, [x0], x2
566     st1        {v3.8b}, [x0], x2
567     st1        {v4.8b}, [x0], x2
568     st1        {v5.8b}, [x0], x2
569     st1        {v6.8b}, [x0], x2
570     st1        {v7.8b}, [x0], x2
571     ret
572 endfunc
573
// Adds four DC values to a 16-wide, 4-row pixel strip.
//   \dc    = halfword-lane view of a vector (e.g. v0.h); lanes 0-3 are used,
//            one per group of four pixel columns
//   x0     = load pointer, x2 = store pointer, x3 = row stride (all set by the
//            caller); x0 and x2 each advance by 4 rows
//   Clobb: v4-v7, v20, v21, v24-v27
// The signed add is performed on bytes as a saturating add of max(dc,0)
// followed by a saturating subtract of max(-dc,0).
574 .macro ADD16x4_IDCT_DC dc
575     ld1         {v4.16b}, [x0], x3
576     dup         v24.8h,  \dc[0]
577     dup         v25.8h,  \dc[1]
578     ld1         {v5.16b}, [x0], x3
579     dup         v26.8h,  \dc[2]
580     dup         v27.8h,  \dc[3]
581     ld1         {v6.16b}, [x0], x3
582     trn1        v24.2d,  v24.2d,  v25.2d        // v24 = dc0 x4 | dc1 x4
583     ld1         {v7.16b}, [x0], x3
584     trn1        v25.2d,  v26.2d,  v27.2d        // v25 = dc2 x4 | dc3 x4
585     neg         v26.8h,  v24.8h
586     neg         v27.8h,  v25.8h
587
588     sqxtun      v20.8b,  v24.8h                 // v20 = positive parts for all 16 columns
589     sqxtun      v21.8b,  v26.8h                 // v21 = negated parts for all 16 columns
590     sqxtun2     v20.16b, v25.8h
591     sqxtun2     v21.16b, v27.8h
592
593     uqadd        v4.16b, v4.16b, v20.16b
594     uqadd        v5.16b, v5.16b, v20.16b
595     uqadd        v6.16b, v6.16b, v20.16b
596     uqadd        v7.16b, v7.16b, v20.16b
597
598     uqsub        v4.16b, v4.16b, v21.16b
599     uqsub        v5.16b, v5.16b, v21.16b
600     uqsub        v6.16b, v6.16b, v21.16b
601     st1         {v4.16b}, [x2], x3
602     uqsub        v7.16b, v7.16b, v21.16b
603     st1         {v5.16b}, [x2], x3
604     st1         {v6.16b}, [x2], x3
605     st1         {v7.16b}, [x2], x3
606 .endm
607
// x264_add16x16_idct_dc_neon: adds 16 rounded DC values to a 16x16 pixel block.
//   In:    x0 = 16x16 pixel block, row stride FDEC_STRIDE (read and rewritten)
//          x1 = 16 halfword DC values (32 bytes), four per 4-row strip
//   Clobb: x0, x2, x3, v0-v7, v20, v21, v24-v27
// x0 is the running load pointer and x2 the running store pointer over the
// same block. Each DC is rounded with (dc + 32) >> 6; the shifts for the later
// strips are issued between macro invocations.
608 function x264_add16x16_idct_dc_neon, export=1
609     mov         x2,  x0
610     mov         x3,  #FDEC_STRIDE
611
612     ld1        {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
613     srshr       v0.4h,  v0.4h,  #6
614     srshr       v1.4h,  v1.4h,  #6
615
616     ADD16x4_IDCT_DC v0.h
617     srshr       v2.4h,  v2.4h,  #6
618     ADD16x4_IDCT_DC v1.h
619     srshr       v3.4h,  v3.4h,  #6
620     ADD16x4_IDCT_DC v2.h
621     ADD16x4_IDCT_DC v3.h
622     ret
623 endfunc
624
// x264_sub8x8_dct_dc_neon: reduces the difference of two 8x8 pixel blocks to
// four halfwords stored at [x0].
//   In:    x0 = output (four halfwords)
//          x1 = first pixel block,  row stride FENC_STRIDE
//          x2 = second pixel block, row stride FDEC_STRIDE
//   Out:   x1 and x2 advanced by 8 rows
//   Clobb: x3, x4, v0-v3, v16-v31
// Phase 1 accumulates per-column sums of the row differences: v0 for rows 0-3,
// v1 for rows 4-7. Phase 2 combines them with 64-bit transposes, add/sub
// steps and pairwise adds - presumably a 2x2 transform of the four 4x4
// quadrant sums; confirm against the C reference.
625 function x264_sub8x8_dct_dc_neon, export=1
626     mov             x3,  #FENC_STRIDE
627     mov             x4,  #FDEC_STRIDE
628     ld1        {v16.8b}, [x1], x3
629     ld1        {v17.8b}, [x2], x4
630     usubl       v16.8h,  v16.8b, v17.8b
631     ld1        {v18.8b}, [x1], x3
632     ld1        {v19.8b}, [x2], x4
633     usubl       v17.8h,  v18.8b, v19.8b
634     ld1        {v20.8b}, [x1], x3
635     ld1        {v21.8b}, [x2], x4
636     usubl       v18.8h, v20.8b, v21.8b
637     ld1        {v22.8b}, [x1], x3
638     add         v0.8h,  v16.8h, v17.8h          // v0 = row0 + row1
639     ld1        {v23.8b}, [x2], x4
640     usubl       v19.8h, v22.8b, v23.8b
641     ld1        {v24.8b}, [x1], x3
642     add         v0.8h,  v0.8h,  v18.8h          //    + row2
643     ld1        {v25.8b}, [x2], x4
644     usubl       v20.8h, v24.8b, v25.8b
645     ld1        {v26.8b}, [x1], x3
646     add         v0.8h,  v0.8h,  v19.8h          //    + row3
647     ld1        {v27.8b}, [x2], x4
648     usubl       v21.8h, v26.8b, v27.8b
649     ld1        {v28.8b}, [x1], x3
650     ld1        {v29.8b}, [x2], x4
651     usubl       v22.8h, v28.8b, v29.8b
652     ld1        {v30.8b}, [x1], x3
653     add         v1.8h,  v20.8h, v21.8h          // v1 = row4 + row5
654     ld1        {v31.8b}, [x2], x4
655     usubl       v23.8h, v30.8b, v31.8b
656     add         v1.8h,  v1.8h,  v22.8h          //    + row6
657     add         v1.8h,  v1.8h,  v23.8h          //    + row7
658
659     transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
660
661     add         v0.8h,  v2.8h,  v3.8h
662     sub         v1.8h,  v2.8h,  v3.8h
663
664     transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
665
666     add         v0.8h,  v2.8h,  v3.8h
667     sub         v1.8h,  v2.8h,  v3.8h
668
669     transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
670
671     addp        v0.8h,  v2.8h,  v3.8h           // pairwise adds fold 16 lanes down to 4
672     addp        v0.8h,  v0.8h,  v0.8h
673
674     st1        {v0.4h}, [x0]
675     ret
676 endfunc
677
// x264_zigzag_interleave_8x8_cavlc_neon: de-interleaves 64 halfword
// coefficients into four groups of 16 and writes one non-zero flag per group.
//   In:    x0 = output, 64 halfwords (128 bytes)
//          x1 = input, 64 halfwords (128 bytes)
//          x2 = flag bytes; offsets 0, 1, 8 and 9 are written
//   Out:   x0 and x1 advanced by 128 bytes; x2 advanced by 9
//   Clobb: x3, v0-v7, v16-v19, v31
// ld4 splits the input by (index mod 4): v0/v4 hold group 0 (first / second
// half of the input), v1/v5 group 1, and so on. Each group is stored as 16
// consecutive halfwords.
// Flag computation: an unsigned max over each group ends up in one 32-bit
// lane of v16 (two halfword partial maxima per lane). A group contains a
// non-zero coefficient exactly when its lane is non-zero, so the lane is
// compared with "higher or same" against 1. A strict "higher than 1" compare
// would miss a lane whose value is exactly 1, i.e. a group whose only non-zero
// magnitude is a +1 reduced into the low halfword.
678 function x264_zigzag_interleave_8x8_cavlc_neon, export=1
679     mov        x3,  #7                          // step from flag offset 1 to offset 8
680     movi       v31.4s, #1                       // compare threshold and 0/1 mask
681     ld4        {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
682     ld4        {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
683     umax       v16.8h, v0.8h,  v4.8h            // per-group unsigned max (non-zero <=> any coefficient set)
684     umax       v17.8h, v1.8h,  v5.8h
685     umax       v18.8h, v2.8h,  v6.8h
686     umax       v19.8h, v3.8h,  v7.8h
687     st1        {v0.8h}, [x0],  #16
688     st1        {v4.8h}, [x0],  #16
689     umaxp      v16.8h, v16.8h, v17.8h
690     umaxp      v18.8h, v18.8h, v19.8h
691     st1        {v1.8h}, [x0],  #16
692     st1        {v5.8h}, [x0],  #16
693     umaxp      v16.8h, v16.8h, v18.8h           // lane n (.4s) now belongs to group n
694     st1        {v2.8h}, [x0],  #16
695     st1        {v6.8h}, [x0],  #16
696     cmhs       v16.4s, v16.4s, v31.4s           // lane >= 1, i.e. lane != 0 (was cmhi: lane > 1)
697     st1        {v3.8h}, [x0],  #16
698     and        v16.16b, v16.16b, v31.16b        // all-ones mask -> 1
699     st1        {v7.8h}, [x0],  #16
700     st1        {v16.b}[0],    [x2],  #1
701     st1        {v16.b}[4],    [x2],  x3
702     st1        {v16.b}[8],    [x2],  #1
703     st1        {v16.b}[12],   [x2]
704     ret
705 endfunc
706
// x264_zigzag_scan_4x4_frame_neon: reorders 16 halfwords with a byte table lookup.
//   In:    x0 = output (32 bytes), x1 = input (32 bytes)
//   Clobb: x2, v0-v3, v16, v17
// The permutation comes from the 32-byte scan4x4_frame table; both output
// halves index the full 32-byte input held in {v0, v1}.
707 function x264_zigzag_scan_4x4_frame_neon, export=1
708     ld1        {v0.16b,v1.16b}, [x1]            // 16 input halfwords
709     movrel      x2, scan4x4_frame
710     ld1        {v16.16b,v17.16b}, [x2]          // byte indices for output halves 0 and 1
711     tbl         v3.16b, {v0.16b,v1.16b}, v17.16b
712     tbl         v2.16b, {v0.16b,v1.16b}, v16.16b
713     st1        {v2.16b,v3.16b},   [x0]
714     ret
715 endfunc
716
// Generates x264_zigzag_sub_4x4[ac]_<f>_neon.
//   \f  = frame | field : selects the permutation table sub4x4_\f
//   \ac = ac or empty   : with "ac", element 0 of the result is written to
//                         [x3] as a halfword and replaced by zero in the output
// Generated function:
//   In:    x0 = output, 16 halfwords (32 bytes)
//          x1 = 4x4 source pixels, row stride FENC_STRIDE
//          x2 = 4x4 destination pixels, row stride FDEC_STRIDE
//          x3 = halfword output for element 0 ("ac" variant only)
//   Out:   [x0] = permuted (source - destination) differences
//          the four source rows are copied over the destination block
//          w0 = 1 if any stored difference is non-zero, else 0
//   Clobb: x4-x7, x9, v0-v7, v16, flags
717 .macro zigzag_sub_4x4 f ac
718 function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
719     mov         x9,  #FENC_STRIDE
720     mov         x4,  #FDEC_STRIDE
721     movrel      x5,  sub4x4_\f
722     mov         x6,  x2                         // x6 = destination pointer for the row copy below
723     ld1        {v0.s}[0], [x1], x9              // v0 = the four source rows, 4 bytes each
724     ld1        {v0.s}[1], [x1], x9
725     ld1        {v0.s}[2], [x1], x9
726     ld1        {v0.s}[3], [x1], x9
727     ld1        {v16.16b}, [x5]
728     ld1        {v1.s}[0], [x2], x4              // v1 = the four destination rows
729     ld1        {v1.s}[1], [x2], x4
730     ld1        {v1.s}[2], [x2], x4
731     ld1        {v1.s}[3], [x2], x4
732     tbl         v2.16b, {v0.16b}, v16.16b       // same permutation applied to both blocks
733     tbl         v3.16b, {v1.16b}, v16.16b
734     st1        {v0.s}[0], [x6], x4              // copy source row 0 over the destination
735     usubl       v4.8h,  v2.8b,  v3.8b           // differences 0-7
736 .ifc \ac, ac
737     dup         h7, v4.h[0]
738     ins         v4.h[0], wzr                    // element 0 is zeroed in the output ...
739     fmov        w5,  s7
740     strh        w5,  [x3]                       // ... and stored separately at [x3]
741 .endif
742     usubl2      v5.8h,  v2.16b, v3.16b          // differences 8-15
743     st1        {v0.s}[1], [x6], x4
744     umax        v6.8h,  v4.8h,  v5.8h           // unsigned max is non-zero iff any lane is non-zero
745     umaxv       h6,  v6.8h
746     st1        {v0.s}[2], [x6], x4
747     fmov        w7,  s6
748     st1        {v0.s}[3], [x6], x4
749     cmp         w7, #0
750     st1        {v4.8h,v5.8h},   [x0]
751     cset        w0, ne
752     ret
753 endfunc
754 .endm
755
// Instantiate all four variants.
756 zigzag_sub_4x4 field
757 zigzag_sub_4x4 field, ac
758 zigzag_sub_4x4 frame
759 zigzag_sub_4x4 frame, ac
760
// x264_zigzag_scan_4x4_field_neon: reorders 16 halfwords for the field scan.
//   In:    x0 = output (32 bytes), x1 = input (32 bytes)
//   Clobb: x2, v0, v1, v16
// Only the first eight halfwords are permuted (16-byte scan4x4_field table);
// the second eight are written back exactly as loaded.
761 function x264_zigzag_scan_4x4_field_neon, export=1
762     ld1        {v0.8h,v1.8h},   [x1]            // v0 = elements 0-7, v1 = elements 8-15
763     movrel      x2, scan4x4_field
764     ld1        {v16.16b},       [x2]            // byte indices for the first half
765     tbl         v0.16b, {v0.16b}, v16.16b
766     st1        {v0.8h,v1.8h},   [x0]
767     ret
768 endfunc
769
// x264_zigzag_scan_8x8_frame_neon: reorders 64 halfwords with table lookups.
//   In:    x0 = output (128 bytes), x1 = input (128 bytes)
//   Out:   x0 and x1 each advanced by 96 bytes (the final access has no post-increment)
//   Clobb: x2, v0-v7, v16-v31
// A tbl lookup can address at most four registers (64 bytes = 32 elements),
// so every output vector is built from a 4-register window of the input.
// The scan8x8_frame table is pre-biased to match each window. Elements that
// fall outside the window come back as zero from tbl and are patched in
// afterwards with single-lane moves.
770 function x264_zigzag_scan_8x8_frame_neon, export=1
771     movrel      x2,  scan8x8_frame
772     ld1        {v0.8h,v1.8h},   [x1], #32       // v0..v7 = input elements 0..63, 8 per register
773     ld1        {v2.8h,v3.8h},   [x1], #32
774     ld1        {v4.8h,v5.8h},   [x1], #32
775     ld1        {v6.8h,v7.8h},   [x1]
776     ld1        {v16.16b,v17.16b}, [x2], #32     // v16..v23 = the eight 16-byte index vectors
777     ld1        {v18.16b,v19.16b}, [x2], #32
778     ld1        {v20.16b,v21.16b}, [x2], #32
779     ld1        {v22.16b,v23.16b}, [x2], #32
780     tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
781     tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
782     tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
783     tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b   // window starts at element 24
784     tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
785     tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b   // window starts at element 32
786     tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
787     tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
// Patch the lanes whose source element lies outside the tbl window used above.
788     mov         v25.h[6], v4.h[0]
789     mov         v25.h[7], v5.h[0]
790     mov         v26.h[0], v4.h[1]
791     mov         v27.h[4], v7.h[0]
792     mov         v28.h[7], v4.h[4]
793     mov         v29.h[7], v3.h[6]
794     mov         v30.h[0], v2.h[7]
795     mov         v30.h[1], v3.h[7]
796     st1        {v24.8h,v25.8h}, [x0], #32
797     st1        {v26.8h,v27.8h}, [x0], #32
798     st1        {v28.8h,v29.8h}, [x0], #32
799     st1        {v30.8h,v31.8h}, [x0]
800     ret
801 endfunc
802
// Index table for x264_zigzag_scan_8x8_frame_neon (eight 16-byte tbl vectors).
//   Z(z)   expands to the two byte indices of halfword element z.
//   T(x,y) names input element x*8+y; it is redefined below with a row bias so
//          that the indices are relative to the first register of the tbl
//          window used for that vector: (x-3) for the window starting at v3,
//          (x-0) for the one starting at v0, (x-4) for the one starting at v4.
// Entries whose biased index falls outside 0..63 yield zero from tbl; the
// function overwrites those lanes with explicit lane moves.
// Z and the last definition of T are left defined after this table.
803 #define Z(z)   2*(z), 2*(z)+1
804 #define T(x,y) Z(x*8+y)
805 const scan8x8_frame, align=5
806     .byte T(0,0), T(1,0), T(0,1), T(0,2)
807     .byte T(1,1), T(2,0), T(3,0), T(2,1)
808     .byte T(1,2), T(0,3), T(0,4), T(1,3)
809     .byte T(2,2), T(3,1), T(4,0), T(5,0)
810     .byte T(4,1), T(3,2), T(2,3), T(1,4)
811     .byte T(0,5), T(0,6), T(1,5), T(2,4)
812 #undef T
813 #define T(x,y) Z((x-3)*8+y)
814     .byte T(3,3), T(4,2), T(5,1), T(6,0)
815     .byte T(7,0), T(6,1), T(5,2), T(4,3)
816 #undef T
817 #define T(x,y) Z((x-0)*8+y)
818     .byte T(3,4), T(2,5), T(1,6), T(0,7)
819     .byte T(1,7), T(2,6), T(3,5), T(4,4)
820 #undef T
821 #define T(x,y) Z((x-4)*8+y)
822     .byte T(5,3), T(6,2), T(7,1), T(7,2)
823     .byte T(6,3), T(5,4), T(4,5), T(3,6)
824     .byte T(2,7), T(3,7), T(4,6), T(5,5)
825     .byte T(6,4), T(7,3), T(7,4), T(6,5)
826     .byte T(5,6), T(4,7), T(5,7), T(6,6)
827     .byte T(7,5), T(7,6), T(6,7), T(7,7)
828 endconst
829
// x264_zigzag_scan_8x8_field_neon: reorders 64 halfwords for the field scan.
//   In:    x0 = output (128 bytes), x1 = input (128 bytes)
//   Out:   x0 and x1 each advanced by 96 bytes (the final access has no post-increment)
//   Clobb: x2, v0-v7, v16-v22, v24-v31
// Seven output vectors come from tbl lookups, each over its own window of the
// input registers, using the seven index vectors of scan8x8_field (a table
// defined outside the visible part of this file). The eighth vector is
// assembled with two ext instructions instead of a lookup.
830 function x264_zigzag_scan_8x8_field_neon, export=1
831     movrel      x2,  scan8x8_field
832     ld1        {v0.8h,v1.8h},   [x1], #32       // v0..v7 = input elements 0..63, 8 per register
833     ld1        {v2.8h,v3.8h},   [x1], #32
834     ld1        {v4.8h,v5.8h},   [x1], #32
835     ld1        {v6.8h,v7.8h},   [x1]
836     ld1        {v16.16b,v17.16b}, [x2], #32
837     ld1        {v18.16b,v19.16b}, [x2], #32
838     ld1        {v20.16b,v21.16b}, [x2], #32
839     ld1        {v22.16b}, [x2]
840     ext         v31.16b, v7.16b, v7.16b, #4     // rotate v7 by two halfwords
841     tbl         v24.16b, {v0.16b,v1.16b},               v16.16b
842     tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
843     tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
844     tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
845     tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
846     tbl         v29.16b, {v4.16b,v5.16b,v6.16b},        v21.16b
847     tbl         v30.16b, {v5.16b,v6.16b,v7.16b},        v22.16b
848     ext         v31.16b, v6.16b, v31.16b, #12   // v31 = v6.h[6], v6.h[7], v7.h[2..7]
849     st1        {v24.8h,v25.8h}, [x0], #32
850     st1        {v26.8h,v27.8h}, [x0], #32
851     st1        {v28.8h,v29.8h}, [x0], #32
852     st1        {v30.8h,v31.8h}, [x0]
853     ret
854 endfunc
855
// zigzag_sub8x8 f: defines x264_zigzag_sub_8x8_<f>_neon, which permutes two
// 8x8 byte blocks with the sub8x8_<f> index table and stores their 16-bit
// differences.
//   x0: destination for the 64 halfword differences (128 bytes written)
//   x1: first 8x8 byte block, rows FENC_STRIDE bytes apart (read only)
//   x2: second 8x8 byte block, rows FDEC_STRIDE bytes apart; it is read and
//       then overwritten with the unpermuted first block
//   returns w0 = 1 if any difference is non-zero, otherwise 0
//   scratch: x4-x7, w9, flags, v0-v7, v16-v31; x0-x2 are advanced
856 .macro zigzag_sub8x8 f
857 function x264_zigzag_sub_8x8_\f\()_neon, export=1
858     movrel      x4,  sub8x8_\f
859     mov         x5,  #FENC_STRIDE
860     mov         x6,  #FDEC_STRIDE
        // keep the start of the second block for the store-back below
861     mov         x7,  x2
        // v0..v3 = eight 8-byte rows of the first block, two rows per register
862     ld1        {v0.d}[0], [x1], x5
863     ld1        {v0.d}[1], [x1], x5
864     ld1        {v1.d}[0], [x1], x5
865     ld1        {v1.d}[1], [x1], x5
866     ld1        {v2.d}[0], [x1], x5
867     ld1        {v2.d}[1], [x1], x5
868     ld1        {v3.d}[0], [x1], x5
869     ld1        {v3.d}[1], [x1]
        // v4..v7 = eight 8-byte rows of the second block
870     ld1        {v4.d}[0], [x2], x6
871     ld1        {v4.d}[1], [x2], x6
872     ld1        {v5.d}[0], [x2], x6
873     ld1        {v5.d}[1], [x2], x6
874     ld1        {v6.d}[0], [x2], x6
875     ld1        {v6.d}[1], [x2], x6
876     ld1        {v7.d}[0], [x2], x6
877     ld1        {v7.d}[1], [x2]
        // v16..v19 = the 64 byte indices of the permutation
878     ld1        {v16.16b,v17.16b}, [x4], #32
879     ld1        {v18.16b,v19.16b}, [x4], #32
        // apply the same permutation to both blocks:
        // v24..v27 = permuted first block, v28..v31 = permuted second block
880     tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
881     tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
882     tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
883     tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
884     tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
885     tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
886     tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
887     tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
        // widen to 16 bits and subtract: first block minus second block.
        // v4..v7 are reused here; the second block is no longer needed.
888     usubl       v4.8h,  v24.8b,  v28.8b
889     usubl2      v5.8h,  v24.16b, v28.16b
890     usubl       v6.8h,  v25.8b,  v29.8b
891     usubl2      v7.8h,  v25.16b, v29.16b
892     usubl       v16.8h, v26.8b,  v30.8b
893     usubl2      v17.8h, v26.16b, v30.16b
894     usubl       v18.8h, v27.8b,  v31.8b
895     usubl2      v19.8h, v27.16b, v31.16b
        // unsigned max over all 64 lanes is non-zero exactly when at least
        // one difference is non-zero; reduce to a single halfword in h22
896     umax        v20.8h, v4.8h,   v5.8h
897     umax        v21.8h, v6.8h,   v7.8h
898     umax        v22.8h, v16.8h,  v17.8h
899     umax        v23.8h, v18.8h,  v19.8h
900     umax        v20.8h, v20.8h,  v21.8h
901     umax        v21.8h, v22.8h,  v23.8h
902     umax        v20.8h, v20.8h,  v21.8h
903     umaxv       h22,    v20.8h
        // copy the unpermuted first block (v0..v3) over the second block
904     st1        {v0.d}[0], [x7], x6
905     st1        {v0.d}[1], [x7], x6
906     st1        {v1.d}[0], [x7], x6
907     st1        {v1.d}[1], [x7], x6
908     st1        {v2.d}[0], [x7], x6
909     st1        {v2.d}[1], [x7], x6
910     st1        {v3.d}[0], [x7], x6
911     st1        {v3.d}[1], [x7]
        // store the 64 permuted differences
912     st1        {v4.8h,v5.8h},   [x0], #32
913     st1        {v6.8h,v7.8h},   [x0], #32
914     st1        {v16.8h,v17.8h}, [x0], #32
915     st1        {v18.8h,v19.8h}, [x0]
        // umaxv wrote h22 and cleared the remaining bits of v22, so s22 holds
        // just the 16-bit maximum; turn it into a 0/1 return value
916     fmov        w9,  s22
917     cmp         w9, #0
918     cset        w0, ne
919     ret
920 endfunc
921 .endm
922
// instantiate the field and frame variants
923 zigzag_sub8x8 field
924 zigzag_sub8x8 frame
925
// scan8x8_field: tbl byte indices consumed by x264_zigzag_scan_8x8_field_neon,
// 16 bytes per output register.
// T(x,y) names source halfword x*8+y. Z() is defined outside this chunk;
// presumably it expands a halfword index to its two byte offsets (the
// function reads 112 bytes for these 56 entries) - TODO confirm.
// T is redefined before each later group so that the indices are relative to
// the first register of the tbl window that uses the group (one register
// holds eight halfwords, so subtracting 1 from x skips one register).
// Only 56 entries are listed: the function builds the last eight outputs
// with ext rather than tbl.
926 #undef T
927 #define T(x,y) Z(x*8+y)
928 const scan8x8_field, align=5
    // outputs 0..7 (window v0..v1) and 8..15 (window v0..v3), not rebased
929     .byte T(0,0), T(0,1), T(0,2), T(1,0)
930     .byte T(1,1), T(0,3), T(0,4), T(1,2)
931     .byte T(2,0), T(1,3), T(0,5), T(0,6)
932     .byte T(0,7), T(1,4), T(2,1), T(3,0)
933 #undef T
934 #define T(x,y) Z((x-1)*8+y)
    // outputs 16..23, window starts at v1
935     .byte T(2,2), T(1,5), T(1,6), T(1,7)
936     .byte T(2,3), T(3,1), T(4,0), T(3,2)
937 #undef T
938 #define T(x,y) Z((x-2)*8+y)
    // outputs 24..31, window starts at v2
939     .byte T(2,4), T(2,5), T(2,6), T(2,7)
940     .byte T(3,3), T(4,1), T(5,0), T(4,2)
941 #undef T
942 #define T(x,y) Z((x-3)*8+y)
    // outputs 32..39, window starts at v3
943     .byte T(3,4), T(3,5), T(3,6), T(3,7)
944     .byte T(4,3), T(5,1), T(6,0), T(5,2)
945 #undef T
946 #define T(x,y) Z((x-4)*8+y)
    // outputs 40..47, window starts at v4
947     .byte T(4,4), T(4,5), T(4,6), T(4,7)
948     .byte T(5,3), T(6,1), T(6,2), T(5,4)
949 #undef T
950 #define T(x,y) Z((x-5)*8+y)
    // outputs 48..55, window starts at v5
951     .byte T(5,5), T(5,6), T(5,7), T(6,3)
952     .byte T(7,0), T(7,1), T(6,4), T(6,5)
953 endconst
954
955
// sub8x8_frame: the 64 tbl byte indices used by x264_zigzag_sub_8x8_frame_neon.
// T(y,x) = x*8+y selects byte y of the x-th 8-byte row loaded by that
// function; note that the first argument is the position within the row.
// The macro body is not parenthesised, which is safe only because every
// argument below is a plain literal.
// This definition of T stays in effect for the table that follows this one.
956 #undef T
957 #define T(y,x) x*8+y
958 const sub8x8_frame, align=5
959     .byte T(0,0), T(1,0), T(0,1), T(0,2)
960     .byte T(1,1), T(2,0), T(3,0), T(2,1)
961     .byte T(1,2), T(0,3), T(0,4), T(1,3)
962     .byte T(2,2), T(3,1), T(4,0), T(5,0)
963     .byte T(4,1), T(3,2), T(2,3), T(1,4)
964     .byte T(0,5), T(0,6), T(1,5), T(2,4)
965     .byte T(3,3), T(4,2), T(5,1), T(6,0)
966     .byte T(7,0), T(6,1), T(5,2), T(4,3)
967     .byte T(3,4), T(2,5), T(1,6), T(0,7)
968     .byte T(1,7), T(2,6), T(3,5), T(4,4)
969     .byte T(5,3), T(6,2), T(7,1), T(7,2)
970     .byte T(6,3), T(5,4), T(4,5), T(3,6)
971     .byte T(2,7), T(3,7), T(4,6), T(5,5)
972     .byte T(6,4), T(7,3), T(7,4), T(6,5)
973     .byte T(5,6), T(4,7), T(5,7), T(6,6)
974     .byte T(7,5), T(7,6), T(6,7), T(7,7)
975 endconst
976
// sub8x8_field: the 64 tbl byte indices used by x264_zigzag_sub_8x8_field_neon.
// Relies on T(y,x) = x*8+y still being defined from the preceding table:
// each entry selects byte y of the x-th 8-byte row loaded by that function.
977 const sub8x8_field, align=5
978     .byte T(0,0), T(0,1), T(0,2), T(1,0)
979     .byte T(1,1), T(0,3), T(0,4), T(1,2)
980     .byte T(2,0), T(1,3), T(0,5), T(0,6)
981     .byte T(0,7), T(1,4), T(2,1), T(3,0)
982     .byte T(2,2), T(1,5), T(1,6), T(1,7)
983     .byte T(2,3), T(3,1), T(4,0), T(3,2)
984     .byte T(2,4), T(2,5), T(2,6), T(2,7)
985     .byte T(3,3), T(4,1), T(5,0), T(4,2)
986     .byte T(3,4), T(3,5), T(3,6), T(3,7)
987     .byte T(4,3), T(5,1), T(6,0), T(5,2)
988     .byte T(4,4), T(4,5), T(4,6), T(4,7)
989     .byte T(5,3), T(6,1), T(6,2), T(5,4)
990     .byte T(5,5), T(5,6), T(5,7), T(6,3)
991     .byte T(7,0), T(7,1), T(6,4), T(6,5)
992     .byte T(6,6), T(6,7), T(7,2), T(7,3)
993     .byte T(7,4), T(7,5), T(7,6), T(7,7)
994 endconst