1 ;*****************************************************************************
2 ;* dct-64.asm: x86_64 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2016 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Holger Lubitz <holger@lubitz.org>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;*          Min Chen <chenm001@163.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 ;* This program is also available under a commercial proprietary license.
26 ;* For more information, contact us at licensing@x264.com.
27 ;*****************************************************************************
30 %include "x86util.asm"
41 ; in: size, m0..m7, temp, temp
; NOTE(review): interior fragment of the forward 8-point 1-D DCT butterfly
; macro; the %macro header is outside this chunk.  From the usage below:
; %1 = element-size suffix (w/d), %2..%9 = the eight input rows,
; %10/%11 = scratch registers.  s=sum, d=difference in the comments.
; Stage 1: sums/differences of mirrored row pairs (rows 3+4, 2+5, 1+6, 0+7).
44 SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
45 SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
46 SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
47 SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
; Stage 2 (even half): butterfly the stage-1 sums.
49 SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
50 SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
; Stage 2 (odd half): a4..a7 built from the stage-1 differences.
; NOTE(review): intervening lines are missing from this chunk, so the exact
; accumulation feeding these adds/subs is not fully visible here.
55 padd%1 m%10, m%4 ; %10=a4
60 psub%1 m%11, m%4 ; %11=a7
67 psub%1 m%2, m%4 ; %2=a5
68 psub%1 m%5, m%3 ; %5=a6
; Stage 3: final output butterflies b0..b7.
71 padd%1 m%3, m%10 ; %3=b1
73 psub%1 m%10, m%11 ; %10=b7
75 SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
78 padd%1 m%4, m%9 ; %4=b2
80 psub%1 m%9, m%8 ; %9=b6
83 padd%1 m%8, m%2 ; %8=b3
85 psub%1 m%5, m%2 ; %5=b5
; Rotate the results back into ascending output-row order.
87 SWAP %2, %7, %5, %8, %9, %10
; NOTE(review): interior fragment of the inverse 8-point 1-D DCT butterfly
; macro; the %macro header is outside this chunk.  %1 = element-size suffix,
; the remaining parameters are register numbers, last two are scratch.
; The end-of-line comments previously lagged the operands by one (stale from
; before the size parameter %1 was inserted); they are corrected below to
; name the actual destination operands.  SUMSUB_BA places the sum in its
; first data operand and the difference in the second (cf. the forward DCT
; annotations earlier in this file).
91 SUMSUB_BA %1, %6, %2, %10 ; %6=a0, %2=a2
96 padd%1 m%10, m%7 ; %10=a7
99 psub%1 m%11, m%8 ; %11=a4
101 padd%1 m%8, m%4 ; %8=a6
106 psub%1 m%4, m%3 ; %4=a5
114 psub%1 m%3, m%5 ; %3=a3
115 psub%1 m%7, m%9 ; %7=a1
; Second stage: combine a0..a7 into b0..b7.
118 padd%1 m%5, m%7 ; %5=b1
120 psub%1 m%10, m%7 ; %10=b7
122 SUMSUB_BA %1, %8, %6, %7 ; %8=b0, %6=b6
123 SUMSUB_BA %1, %11, %2, %7 ; %11=b2, %2=b4
126 padd%1 m%9, m%3 ; %9=b3
128 psub%1 m%3, m%4 ; %3=b5
; Final stage: output butterflies c0..c7.
130 SUMSUB_BA %1, %10, %8, %7 ; %10=c0, %8=c7
131 SUMSUB_BA %1, %3, %11, %7 ; %3=c1, %11=c6
132 SUMSUB_BA %1, %9, %2, %7 ; %9=c2, %2=c5
133 SUMSUB_BA %1, %5, %6, %7 ; %5=c3, %6=c4
; 8x8 forward DCT of the residual pix1-pix2 (high-bit-depth path: 32-bit
; coefficients, hence the word->dword pass split below).
; NOTE(review): fragment — the enclosing %macro header and several interior
; lines are not visible in this chunk.
143 cglobal sub8x8_dct8, 3,3,14
; Tail-call so other code can enter at .skip_prologue with args already set.
144 TAIL_CALL .skip_prologue, 0
145 global current_function %+ .skip_prologue
; Load the 8x8 residual (fenc in r1, fdec in r2) into m0..m7.
147 LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
148 LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
; First (horizontal-domain) pass on 16-bit data.
150 DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
152 TRANSPOSE4x4W 0,1,2,3,8
; Second pass widened to 32-bit (dword) arithmetic.
157 DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
167 TRANSPOSE4x4W 4,5,6,7,0
172 DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
182 %endmacro ; SUB8x8_DCT8
; Inverse 8x8 DCT added into the 8x8 destination block at r0 (high-bit-depth
; path: dword coefficients in m0..m15, clamped to pw_pixel_max on store).
; NOTE(review): interior lines of this macro are missing from this chunk;
; the coefficient loads between the two pass groups are not visible here.
191 %macro ADD8x8_IDCT8 0
192 cglobal add8x8_idct8, 2,2,16
194 TAIL_CALL .skip_prologue, 0
195 global current_function %+ .skip_prologue
; First (row) pass + transpose of both 4x4 quadrant groups.
205 IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
206 TRANSPOSE4x4D 0,1,2,3,8
207 TRANSPOSE4x4D 4,5,6,7,8
220 IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
221 TRANSPOSE4x4D 8,9,10,11,6
222 TRANSPOSE4x4D 12,13,14,15,6
; Second (column) pass.
223 IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
228 IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
; m9 = per-lane clamp limit for the pixel depth.
230 mova m9, [pw_pixel_max]
; Add each result row to the existing pixels and clamp to [0, pixel_max].
231 STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
232 STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
233 STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
234 STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
237 STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
238 STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
239 STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
240 STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
242 %endmacro ; ADD8x8_IDCT8
249 %else ; !HIGH_BIT_DEPTH
; 8x8 residual transform as four 4x4 DCTs (8-bit path).
; NOTE(review): fragment — the enclosing %macro header is outside this chunk.
252 cglobal sub8x8_dct, 3,3,10
; Bias r2 so later accesses can address rows above/below with signed offsets.
253 add r2, 4*FDEC_STRIDE
257 TAIL_CALL .skip_prologue, 0
258 global current_function %+ .skip_prologue
; Load the 8x8 residual (r1=fenc, r2=fdec) into m0..m7, two 8x4 halves.
261 LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
262 LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
; Row pass, transpose, then column pass per 8x4 half; m8 is scratch.
263 DCT4_1D 0, 1, 2, 3, 8
264 TRANSPOSE2x4x4W 0, 1, 2, 3, 8
265 DCT4_1D 4, 5, 6, 7, 8
266 TRANSPOSE2x4x4W 4, 5, 6, 7, 8
267 DCT4_1D 0, 1, 2, 3, 8
268 STORE_DCT 0, 1, 2, 3, r0, 0
269 DCT4_1D 4, 5, 6, 7, 8
270 STORE_DCT 4, 5, 6, 7, r0, 64
273 ;-----------------------------------------------------------------------------
274 ; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
275 ;-----------------------------------------------------------------------------
276 cglobal sub8x8_dct8, 3,3,11
277 add r2, 4*FDEC_STRIDE
281 TAIL_CALL .skip_prologue, 0
282 global current_function %+ .skip_prologue
285 LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
286 LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
287 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
288 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
289 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
; Alias integer moves/unpacks to their FP-form equivalents — presumably for
; shorter encodings on the targeted instruction set; confirm against the
; enclosing INIT_* block, which is outside this chunk.
302 %define movdqa movaps
303 %define punpcklqdq movlhps
; 16x16 residual DCT as 8x8 sub-transforms using 256-bit registers: each ymm
; holds one row from two horizontally adjacent 8x8 blocks (AVX2 path).
; NOTE(review): fragment — the enclosing %macro header, loop/label structure
; and the r0 adjustments between block pairs are not visible in this chunk.
315 cglobal sub16x16_dct8, 3,3,10
317 add r2, 4*FDEC_STRIDE
; Advance to the lower half of the 16x16 block.
320 add r1, FENC_STRIDE*8
321 add r2, FDEC_STRIDE*8
; Two rows of residual per call, both 8-wide blocks per ymm lane.
325 LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
326 LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
327 LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
328 LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
; Row pass, transpose, column pass — both 8x8 blocks in parallel.
329 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
330 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
331 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
; De-interleave the lanes on store: low xmm half -> first 8x8 block
; (r0-0x80), high 128-bit lane -> second 8x8 block (r0).
332 mova [r0-0x80+0x00], xm0
333 vextracti128 [r0+0x00], m0, 1
334 mova [r0-0x80+0x10], xm1
335 vextracti128 [r0+0x10], m1, 1
336 mova [r0-0x80+0x20], xm2
337 vextracti128 [r0+0x20], m2, 1
338 mova [r0-0x80+0x30], xm3
339 vextracti128 [r0+0x30], m3, 1
340 mova [r0-0x80+0x40], xm4
341 vextracti128 [r0+0x40], m4, 1
342 mova [r0-0x80+0x50], xm5
343 vextracti128 [r0+0x50], m5, 1
344 mova [r0-0x80+0x60], xm6
345 vextracti128 [r0+0x60], m6, 1
346 mova [r0-0x80+0x70], xm7
347 vextracti128 [r0+0x70], m7, 1
350 ;-----------------------------------------------------------------------------
351 ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
352 ;-----------------------------------------------------------------------------
353 %macro ADD8x8_IDCT8 0
354 cglobal add8x8_idct8, 2,2,11
355 add r0, 4*FDEC_STRIDE
357 TAIL_CALL .skip_prologue, 0
358 global current_function %+ .skip_prologue
369 IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
370 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
371 paddw m0, [pw_32] ; rounding for the >>6 at the end
372 IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
373 DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
374 DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
375 DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
376 DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
377 STORE_IDCT m1, m3, m5, m7
379 %endmacro ; ADD8x8_IDCT8
386 ;-----------------------------------------------------------------------------
387 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
388 ;-----------------------------------------------------------------------------
390 cglobal add8x8_idct, 2,2,11
391 add r0, 4*FDEC_STRIDE
393 TAIL_CALL .skip_prologue, 0
394 global current_function %+ .skip_prologue
401 SBUTTERFLY qdq, 0, 1, 4
402 SBUTTERFLY qdq, 2, 3, 4
407 SBUTTERFLY qdq, 4, 5, 8
408 SBUTTERFLY qdq, 6, 7, 8
409 IDCT4_1D w,0,1,2,3,8,10
410 TRANSPOSE2x4x4W 0,1,2,3,8
411 IDCT4_1D w,4,5,6,7,8,10
412 TRANSPOSE2x4x4W 4,5,6,7,8
414 IDCT4_1D w,0,1,2,3,8,10
416 IDCT4_1D w,4,5,6,7,8,10
417 DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
418 DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
419 DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
420 DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
421 STORE_IDCT m1, m3, m5, m7
430 %endif ; !HIGH_BIT_DEPTH