From 5ad5e5d8f119127146c733e50ee87f95339eb2ae Mon Sep 17 00:00:00 2001
From: David Chen <david.chen@myais.com.cn>
Date: Wed, 1 Nov 2023 08:31:34 +0200
Subject: [PATCH] Improve deblock-a.S Performance by Using SVE/SVE2

Improve the performance of NEON functions of aarch64/deblock-a.S
by using the SVE/SVE2 instruction set. Below, the specific functions
are listed together with the improved performance results.

Command executed: ./checkasm8 --bench=deblock
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
deblock_chroma[1]_c: 735
deblock_chroma[1]_neon: 427
deblock_chroma[1]_sve: 353

Command executed: ./checkasm8 --bench=deblock
Testbed: AWS Graviton3
Results:
deblock_chroma[1]_c: 719
deblock_chroma[1]_neon: 442
deblock_chroma[1]_sve: 345
---
 Makefile                       |  3 +-
 common/aarch64/deblock-a-sve.S | 98 ++++++++++++++++++++++++++++++++++
 common/aarch64/deblock.h       |  3 ++
 common/deblock.c               |  6 +++
 4 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 common/aarch64/deblock-a-sve.S

diff --git a/Makefile b/Makefile
index 3dc52230f..3f088386e 100644
--- a/Makefile
+++ b/Makefile
@@ -171,7 +171,8 @@ SRCASM_X  = common/aarch64/bitstream-a.S \
             common/aarch64/predict-a.S \
             common/aarch64/quant-a.S
 ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
-SRCASM_X += common/aarch64/dct-a-sve.S
+SRCASM_X += common/aarch64/dct-a-sve.S \
+            common/aarch64/deblock-a-sve.S
 endif
 ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
 SRCASM_X += common/aarch64/dct-a-sve2.S
diff --git a/common/aarch64/deblock-a-sve.S b/common/aarch64/deblock-a-sve.S
new file mode 100644
index 000000000..73f32572c
--- /dev/null
+++ b/common/aarch64/deblock-a-sve.S
@@ -0,0 +1,98 @@
+/*****************************************************************************
+ * deblock-a-sve.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen@myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "deblock-a-common.S"
+
+.arch armv8-a+sve
+
+.macro h264_loop_filter_chroma_sve
+    ptrue           p0.b, vl16               // p0: first 16 byte lanes active (the lanes the NEON code below works on)
+
+    dup             v22.16b, w2              // alpha
+    uxtl            v24.8h,  v24.8b          // widen clip limits to 16 bits; v24 is presumably the tc0 bytes set up by h264_loop_filter_start (not shown here) -- confirm
+    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
+    uxtl            v4.8h,   v0.8b           // v4/v5 = q0 widened to 16 bits (low half)
+    uxtl2           v5.8h,   v0.16b          // (high half)
+    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
+    usubw           v4.8h,   v4.8h,   v16.8b   // v4/v5 = q0 - p0
+    usubw2          v5.8h,   v5.8h,   v16.16b
+    sli             v24.8h,  v24.8h,  #8     // duplicate each limit byte into both bytes of its halfword
+    shl             v4.8h,   v4.8h,   #2     // (q0 - p0) << 2
+    shl             v5.8h,   v5.8h,   #2
+    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
+    uxtl            v24.4s,  v24.4h          // widen the low four byte pairs to 32 bits
+    uaddw           v4.8h,   v4.8h,   v18.8b   // + p1
+    uaddw2          v5.8h,   v5.8h,   v18.16b
+
+    cmphi           p1.b, p0/z, z22.b, z26.b   // p1 = alpha > abs(p0 - q0)
+    usubw           v4.8h,   v4.8h,   v2.8b    // - q1
+    usubw2          v5.8h,   v5.8h,   v2.16b
+    sli             v24.4s,  v24.4s,  #16    // each of the four limit bytes now fills four consecutive lanes
+    dup             v22.16b, w3              // beta
+    rshrn           v4.8b,   v4.8h,   #3     // delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3, narrowed to bytes
+    rshrn2          v4.16b,  v5.8h,   #3
+    cmphi           p2.b, p0/z, z22.b, z28.b   // p2 = beta > abs(p1 - p0)
+    cmphi           p3.b, p0/z, z22.b, z30.b   // p3 = beta > abs(q1 - q0)
+    smin            v4.16b,  v4.16b,  v24.16b  // delta = min(delta, limit)
+    neg             v25.16b, v24.16b         // v25 = -limit
+    and             p1.b, p0/z, p1.b, p2.b   // combine the alpha test with the first beta test
+    smax            v4.16b,  v4.16b,  v25.16b  // delta = max(delta, -limit)
+    and             p1.b, p0/z, p1.b, p3.b   // p1 = lanes that pass all three threshold tests
+    uxtl            v22.8h,  v0.8b           // v22/v23 = q0 widened to 16 bits (beta is no longer needed)
+    uxtl2           v23.8h,  v0.16b
+
+    uxtl            v28.8h,  v16.8b          // v28/v29 = p0 widened to 16 bits
+    uxtl2           v29.8h,  v16.16b
+    saddw           v28.8h,  v28.8h,  v4.8b    // p0 + delta
+    saddw2          v29.8h,  v29.8h,  v4.16b
+    ssubw           v22.8h,  v22.8h,  v4.8b    // q0 - delta
+    ssubw2          v23.8h,  v23.8h,  v4.16b
+    sqxtun          v16.8b,  v28.8h          // saturate to unsigned 8 bits: v16 = filtered p0
+    sqxtun          v0.8b,   v22.8h          // saturate to unsigned 8 bits: v0 = filtered q0
+    sqxtun2         v16.16b, v29.8h
+    sqxtun2         v0.16b,  v23.8h
+.endm
+
+function deblock_v_chroma_sve, export=1
+    h264_loop_filter_start
+
+    sub             x0,  x0,  x1, lsl #1     // x0 = pix - 2 * stride (row p1)
+    // SVE loads gave no measurable speed-up here, so the four rows are
+    // still fetched with NEON ld1 (x0 advances by stride after each load).
+    ld1             {v18.16b}, [x0], x1      // p1
+    ld1             {v16.16b}, [x0], x1      // p0
+    ld1             {v0.16b},  [x0], x1      // q0
+    ld1             {v2.16b},  [x0]          // q1
+
+    h264_loop_filter_chroma_sve
+
+    sub             x0,  x0,  x1, lsl #1     // x0 = pix - stride (row p0)
+    st1b            {z16.b}, p1, [x0]        // write filtered p0 only in the lanes selected by p1
+    add             x0, x0, x1               // x0 = pix (row q0)
+    st1b            {z0.b}, p1, [x0]         // write filtered q0 only in the lanes selected by p1
+
+    ret
+endfunc
diff --git a/common/aarch64/deblock.h b/common/aarch64/deblock.h
index 8eb9d036d..aae9751af 100644
--- a/common/aarch64/deblock.h
+++ b/common/aarch64/deblock.h
@@ -55,4 +55,7 @@ void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, i
 #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
 void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 
+#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
+void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+
 #endif
diff --git a/common/deblock.c b/common/deblock.c
index f53767359..c1253cf49 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -803,6 +803,12 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
         pf->deblock_strength     = x264_deblock_strength_neon;
     }
+#if HAVE_SVE
+    if ( cpu&X264_CPU_SVE )
+    {
+        pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
+    }
+#endif
 #endif
 
 #if HAVE_MSA