From fddfad7cfc2761c3262ab0c78df656eb119941e7 Mon Sep 17 00:00:00 2001 From: IanSB Date: Mon, 19 Apr 2021 22:32:37 +0100 Subject: [PATCH] Reduce CPU usage when FFOSD not in use in 12bpp modes --- ...pture_line_default_twelvebits_8bpp_16bpp.S | 41 +++++++++++++++++-- src/capture_line_fast_twelvebits_8bpp_16bpp.S | 40 ++++++++++++++++-- src/macros.S | 26 ++++++++++-- src/rgb_to_fb.S | 2 - 4 files changed, 95 insertions(+), 14 deletions(-) diff --git a/src/capture_line_default_twelvebits_8bpp_16bpp.S b/src/capture_line_default_twelvebits_8bpp_16bpp.S index 6aa32dfc..eef049dd 100644 --- a/src/capture_line_default_twelvebits_8bpp_16bpp.S +++ b/src/capture_line_default_twelvebits_8bpp_16bpp.S @@ -94,24 +94,27 @@ preload_capture_line_default_eightbits_8bpp: capture_line_default_twelvebits_16bpp: push {lr} SETUP_VSYNC_DEBUG_16BPP_R11 + tst r3, #BIT_OSD + bne OSD_capture_line_default_twelvebits_16bpp + SKIP_PSYNC_NO_OLD_CPLD SETUP_TWELVE_BITS_MASK_R14 loop_16bpp: WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r5 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r6 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r7 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r10 // input in r8 @@ -121,6 +124,36 @@ loop_16bpp: bne loop_16bpp pop {r0, pc} + +OSD_capture_line_default_twelvebits_16bpp: + SKIP_PSYNC_NO_OLD_CPLD + SETUP_TWELVE_BITS_MASK_R14 +OSD_loop_16bpp: + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r5 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r6 // input in r8 + + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r7 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r10 // input in r8 + + WRITE_R5_R6_R7_R10_16BPP + + subs r1, r1, #1 + bne OSD_loop_16bpp + + pop {r0, pc} + preload_capture_line_default_twelvebits_16bpp: SETUP_DUMMY_PARAMETERS b capture_line_default_twelvebits_16bpp diff --git a/src/capture_line_fast_twelvebits_8bpp_16bpp.S b/src/capture_line_fast_twelvebits_8bpp_16bpp.S index 5e496709..271882cd 100644 --- a/src/capture_line_fast_twelvebits_8bpp_16bpp.S +++ b/src/capture_line_fast_twelvebits_8bpp_16bpp.S @@ -94,24 +94,27 @@ preload_capture_line_fast_eightbits_8bpp: capture_line_fast_twelvebits_16bpp: push {lr} SETUP_VSYNC_DEBUG_16BPP_R11 + tst r3, #BIT_OSD + bne OSD_capture_line_fast_twelvebits_16bpp + SKIP_PSYNC_NO_OLD_CPLD_FAST SETUP_TWELVE_BITS_MASK_R14 loop_16bpp: WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r5 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r6 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r7 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 - CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 CAPTURE_TWELVE_BITS_16BPP_HI r10 // input in r8 stmia r0!, {r5, r6, r7, r10} @@ -122,6 +125,35 @@ loop_16bpp: mov r0, r2 pop {pc} +OSD_capture_line_fast_twelvebits_16bpp: + SKIP_PSYNC_NO_OLD_CPLD_FAST + SETUP_TWELVE_BITS_MASK_R14 +OSD_loop_16bpp: + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r5 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r6 // input in r8 + + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r7 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_LO r11 // input in r8 + WAIT_FOR_PSYNC_EDGE_FAST // expects GPLEV0 in r4, result in r8 + OSD_CAPTURE_TWELVE_BITS_16BPP_HI r10 // input in r8 + stmia r0!, {r5, r6, r7, r10} + + subs r1, r1, #1 + bne OSD_loop_16bpp + + mov r0, r2 + pop {pc} + preload_capture_line_fast_twelvebits_16bpp: SETUP_DUMMY_PARAMETERS b capture_line_fast_twelvebits_16bpp diff --git a/src/macros.S b/src/macros.S index 428fc9b9..643434e6 100644 --- a/src/macros.S +++ b/src/macros.S @@ -798,21 +798,39 @@ wait_wr\@: // Pixel in GPIO 13.. 2 -> 15.. 0 and r9, r8, r14 eor r10, \reg, r9, lsr #(PIXEL_BASE) +.endm + +.macro CAPTURE_TWELVE_BITS_16BPP_HI reg + // Pixel in GPIO 13.. 2 -> 31.. 16 + and r9, r8, r14 + eor \reg, r10, r9, lsl #(16 - PIXEL_BASE) +.endm +.macro OSD_TEST_CAPTURE_TWELVE_BITS_16BPP_LO reg + // Pixel in GPIO 13.. 2 -> 15.. 0 + and r9, r8, r14 + eor r10, \reg, r9, lsr #(PIXEL_BASE) tst r8, #MUX_MASK - orrne r10, #0xff00 orrne r3, #BIT_PROBE +.endm + +.macro OSD_CAPTURE_TWELVE_BITS_16BPP_LO reg + // Pixel in GPIO 13.. 2 -> 15.. 0 + and r9, r8, r14 + eor r10, \reg, r9, lsr #(PIXEL_BASE) + tst r8, #MUX_MASK + orrne r3, #BIT_PROBE + orrne r10, #0xff00 orrne r10, #0x00ff .endm -.macro CAPTURE_TWELVE_BITS_16BPP_HI reg +.macro OSD_CAPTURE_TWELVE_BITS_16BPP_HI reg // Pixel in GPIO 13.. 2 -> 31.. 16 and r9, r8, r14 eor \reg, r10, r9, lsl #(16 - PIXEL_BASE) - tst r8, #MUX_MASK - orrne \reg, \reg, #(0xff000000) orrne r3, #BIT_PROBE + orrne \reg, \reg, #(0xff000000) orrne \reg, \reg, #(0x00ff0000) .endm diff --git a/src/rgb_to_fb.S b/src/rgb_to_fb.S index d571c3fc..65edbb14 100644 --- a/src/rgb_to_fb.S +++ b/src/rgb_to_fb.S @@ -681,13 +681,11 @@ skip_line_loop_exit: ldr r12, capture_address sub r12, r12, #4 // Call preload capture line function (runs all paths of capture code to preload it into cache - OSD version) - // waits for csync so loses one line blx r12 pop {r3} ldr r12, capture_address sub r12, r12, #4 // Call preload capture line function (runs all paths of capture code to preload it into cache) - // waits for csync so loses one line blx r12 pop {r1-r5, r11} mov r6, #0