/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
ldr w6, [x4]
ccmp w3, #0, #0, ne
mov v24.S[0], w6
- and w6, w6, w6, lsl #16
+ and w8, w6, w6, lsl #16
b.eq 1f
- ands w6, w6, w6, lsl #8
+ ands w8, w8, w8, lsl #8
b.ge 2f
1:
ret
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
+ and v21.16B, v21.16B, v30.16B // < beta
+ shrn v30.8b, v21.8h, #4
+ mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta
- and v21.16B, v21.16B, v30.16B
cmhi v19.16B, v22.16B, v19.16B // < beta
+ cbz x7, 9f
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B
st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0]
-
+9:
ret
endfunc
st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1
-
+9:
ret
endfunc
+
+.macro h264_loop_filter_start_intra
+ orr w4, w2, w3
+ cbnz w4, 1f
+ ret
+1:
+ sxtw x1, w1
+ dup v30.16b, w2 // alpha
+ dup v31.16b, w3 // beta
+.endm
+
+.macro h264_loop_filter_luma_intra
+ uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
+ uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
+ uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
+ cmhi v19.16b, v30.16b, v16.16b // < alpha
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ movi v29.16b, #2
+ ushr v30.16b, v30.16b, #2 // alpha >> 2
+ add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
+ cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
+
+ and v19.16b, v19.16b, v17.16b
+ and v19.16b, v19.16b, v18.16b
+ shrn v20.8b, v19.8h, #4
+ mov x4, v20.d[0]
+ cbz x4, 9f
+
+ ushll v20.8h, v6.8b, #1
+ ushll v22.8h, v1.8b, #1
+ ushll2 v21.8h, v6.16b, #1
+ ushll2 v23.8h, v1.16b, #1
+ uaddw v20.8h, v20.8h, v7.8b
+ uaddw v22.8h, v22.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v7.16b
+ uaddw2 v23.8h, v23.8h, v0.16b
+ uaddw v20.8h, v20.8h, v1.8b
+ uaddw v22.8h, v22.8h, v6.8b
+ uaddw2 v21.8h, v21.8h, v1.16b
+ uaddw2 v23.8h, v23.8h, v6.16b
+
+ rshrn v24.8b, v20.8h, #2 // p0'_1
+ rshrn v25.8b, v22.8h, #2 // q0'_1
+ rshrn2 v24.16b, v21.8h, #2 // p0'_1
+ rshrn2 v25.16b, v23.8h, #2 // q0'_1
+
+ uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
+ uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ and v17.16b, v16.16b, v17.16b // if_2 && if_3
+ and v18.16b, v16.16b, v18.16b // if_2 && if_4
+
+ not v30.16b, v17.16b
+ not v31.16b, v18.16b
+
+ and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
+ and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
+
+ and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
+ and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
+
+ //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
+ uaddl v26.8h, v5.8b, v7.8b
+ uaddl2 v27.8h, v5.16b, v7.16b
+ uaddw v26.8h, v26.8h, v0.8b
+ uaddw2 v27.8h, v27.8h, v0.16b
+ add v20.8h, v20.8h, v26.8h
+ add v21.8h, v21.8h, v27.8h
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v0.16b
+ rshrn v20.8b, v20.8h, #3 // p0'_2
+ rshrn2 v20.16b, v21.8h, #3 // p0'_2
+ uaddw v26.8h, v26.8h, v6.8b
+ uaddw2 v27.8h, v27.8h, v6.16b
+ rshrn v21.8b, v26.8h, #2 // p1'_2
+ rshrn2 v21.16b, v27.8h, #2 // p1'_2
+ uaddl v28.8h, v4.8b, v5.8b
+ uaddl2 v29.8h, v4.16b, v5.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v19.8b, v28.8h, #3 // p2'_2
+ rshrn2 v19.16b, v29.8h, #3 // p2'_2
+
+ //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
+ uaddl v26.8h, v2.8b, v0.8b
+ uaddl2 v27.8h, v2.16b, v0.16b
+ uaddw v26.8h, v26.8h, v7.8b
+ uaddw2 v27.8h, v27.8h, v7.16b
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ uaddw v22.8h, v22.8h, v7.8b
+ uaddw2 v23.8h, v23.8h, v7.16b
+ rshrn v22.8b, v22.8h, #3 // q0'_2
+ rshrn2 v22.16b, v23.8h, #3 // q0'_2
+ uaddw v26.8h, v26.8h, v1.8b
+ uaddw2 v27.8h, v27.8h, v1.16b
+ rshrn v23.8b, v26.8h, #2 // q1'_2
+ rshrn2 v23.16b, v27.8h, #2 // q1'_2
+ uaddl v28.8h, v2.8b, v3.8b
+ uaddl2 v29.8h, v2.16b, v3.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v26.8b, v28.8h, #3 // q2'_2
+ rshrn2 v26.16b, v29.8h, #3 // q2'_2
+
+ bit v7.16b, v24.16b, v30.16b // p0'_1
+ bit v0.16b, v25.16b, v31.16b // q0'_1
+ bit v7.16b, v20.16b, v17.16b // p0'_2
+ bit v6.16b, v21.16b, v17.16b // p1'_2
+ bit v5.16b, v19.16b, v17.16b // p2'_2
+ bit v0.16b, v22.16b, v18.16b // q0'_2
+ bit v1.16b, v23.16b, v18.16b // q1'_2
+ bit v2.16b, v26.16b, v18.16b // q2'_2
+.endm
+
+function ff_h264_v_loop_filter_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v1.16b}, [x0], x1 // q1
+ ld1 {v2.16b}, [x0], x1 // q2
+ ld1 {v3.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #3
+ ld1 {v4.16b}, [x0], x1 // p3
+ ld1 {v5.16b}, [x0], x1 // p2
+ ld1 {v6.16b}, [x0], x1 // p1
+ ld1 {v7.16b}, [x0] // p0
+
+ h264_loop_filter_luma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v5.16b}, [x0], x1 // p2
+ st1 {v6.16b}, [x0], x1 // p1
+ st1 {v7.16b}, [x0], x1 // p0
+ st1 {v0.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x0] // q2
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, #4
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v7.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v5.d}[1], [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v7.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v3.d}[1], [x0], x1
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ h264_loop_filter_luma_intra
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ sub x0, x0, x1, lsl #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[1], [x0], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ st1 {v3.d}[1], [x0], x1
+9:
+ ret
+endfunc
+
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
+ dup v23.8B, w3 // beta
uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
- uxtl v4.8H, v0.8B
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
+ uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ cmhi v26.8B, v22.8B, v26.8B // < alpha
+ cmhi v28.8B, v23.8B, v28.8B // < beta
+ cmhi v30.8B, v23.8B, v30.8B // < beta
+ uxtl v4.8H, v0.8B
+ and v26.8B, v26.8B, v28.8B
usubw v4.8H, v4.8H, v16.8B
- sli v24.8H, v24.8H, #8
+ and v26.8B, v26.8B, v30.8B
shl v4.8H, v4.8H, #2
- uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
+ mov x8, v26.d[0]
+ sli v24.8H, v24.8H, #8
uaddw v4.8H, v4.8H, v18.8B
- cmhi v26.8B, v22.8B, v26.8B // < alpha
+ cbz x8, 9f
usubw v4.8H, v4.8H, v2.8B
- dup v22.8B, w3 // beta
rshrn v4.8B, v4.8H, #3
- cmhi v28.8B, v22.8B, v28.8B // < beta
- cmhi v30.8B, v22.8B, v30.8B // < beta
smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B
- and v26.8B, v26.8B, v28.8B
smax v4.8B, v4.8B, v25.8B
- and v26.8B, v26.8B, v30.8B
uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B
sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1
-
+9:
ret
endfunc
sxtw x1, w1
sub x0, x0, #2
+h_loop_filter_chroma420:
ld1 {v18.S}[0], [x0], x1
ld1 {v16.S}[0], [x0], x1
ld1 {v0.S}[0], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1
-
+9:
ret
endfunc
+function ff_h264_h_loop_filter_chroma422_neon, export=1
+ sxtw x1, w1
+ h264_loop_filter_start
+ add x5, x0, x1
+ sub x0, x0, #2
+ add x1, x1, x1
+ mov x7, x30
+ bl h_loop_filter_chroma420
+ mov x30, x7
+ sub x0, x5, #2
+ mov v24.s[0], w6
+ b h_loop_filter_chroma420
+endfunc
+
+.macro h264_loop_filter_chroma_intra
+ uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
+ uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
+ uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
+ cmhi v26.8b, v30.8b, v26.8b // < alpha
+ cmhi v27.8b, v31.8b, v27.8b // < beta
+ cmhi v28.8b, v31.8b, v28.8b // < beta
+ and v26.8b, v26.8b, v27.8b
+ and v26.8b, v26.8b, v28.8b
+ mov x2, v26.d[0]
+
+ ushll v4.8h, v18.8b, #1
+ ushll v6.8h, v19.8b, #1
+ cbz x2, 9f
+ uaddl v20.8h, v16.8b, v19.8b
+ uaddl v22.8h, v17.8b, v18.8b
+ add v20.8h, v20.8h, v4.8h
+ add v22.8h, v22.8h, v6.8h
+ uqrshrn v24.8b, v20.8h, #2
+ uqrshrn v25.8b, v22.8h, #2
+ bit v16.8b, v24.8b, v26.8b
+ bit v17.8b, v25.8b, v26.8b
+.endm
+
+function ff_h264_v_loop_filter_chroma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, x1, lsl #1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x0]
+
+ h264_loop_filter_chroma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x4, x0, #2
+ sub x0, x0, #1
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
+
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+
+ h264_loop_filter_chroma_intra
+
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x4, x0, #2
+ sub x0, x0, #1
+h_loop_filter_chroma420_intra:
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
+ ld1 {v18.s}[1], [x4], x1
+ ld1 {v16.s}[1], [x4], x1
+ ld1 {v17.s}[1], [x4], x1
+ ld1 {v19.s}[1], [x4], x1
+
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+
+ h264_loop_filter_chroma_intra
+
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
+ st2 {v16.b,v17.b}[4], [x0], x1
+ st2 {v16.b,v17.b}[5], [x0], x1
+ st2 {v16.b,v17.b}[6], [x0], x1
+ st2 {v16.b,v17.b}[7], [x0], x1
+
+9:
+ ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
+ h264_loop_filter_start_intra
+ sub x4, x0, #2
+ add x5, x0, x1, lsl #3
+ sub x0, x0, #1
+ mov x7, x30
+ bl h_loop_filter_chroma420_intra
+ sub x0, x5, #1
+ mov x30, x7
+ b h_loop_filter_chroma420_intra
+endfunc
+
.macro biweight_16 macs, macd
dup v0.16B, w5
dup v1.16B, w6