forked from nuttx/nuttx-update
arm64: Support for FPU profiling with procfs
Summary: Reducing the number of FPU context switches improves system performance, so the use of the FPU has to be balanced against the number of FPU traps. This PR adds a basic method to view the FPU performance counters through the NuttX procfs. Please read the "FPU Support and Performance" chapter of README.txt for more information. Signed-off-by: qinwei1 <qinwei1@xiaomi.com>
This commit is contained in:
parent
165e266502
commit
c4f3f8801f
10 changed files with 240 additions and 30 deletions
|
@ -129,37 +129,37 @@ config ARCH_CORTEX_A53
|
|||
bool
|
||||
default n
|
||||
select ARCH_ARMV8A
|
||||
select ARM_HAVE_NEON
|
||||
select ARCH_HAVE_TRUSTZONE
|
||||
select ARCH_DCACHE
|
||||
select ARCH_ICACHE
|
||||
select ARCH_HAVE_MMU
|
||||
select ARCH_HAVE_FPU
|
||||
select ARCH_HAVE_TESTSET
|
||||
select ARM_HAVE_NEON
|
||||
|
||||
config ARCH_CORTEX_A57
|
||||
bool
|
||||
default n
|
||||
select ARCH_ARMV8A
|
||||
select ARM_HAVE_NEON
|
||||
select ARCH_HAVE_TRUSTZONE
|
||||
select ARCH_DCACHE
|
||||
select ARCH_ICACHE
|
||||
select ARCH_HAVE_MMU
|
||||
select ARCH_HAVE_FPU
|
||||
select ARCH_HAVE_TESTSET
|
||||
select ARM_HAVE_NEON
|
||||
|
||||
config ARCH_CORTEX_A72
|
||||
bool
|
||||
default n
|
||||
select ARCH_ARMV8A
|
||||
select ARM_HAVE_NEON
|
||||
select ARCH_HAVE_TRUSTZONE
|
||||
select ARCH_DCACHE
|
||||
select ARCH_ICACHE
|
||||
select ARCH_HAVE_MMU
|
||||
select ARCH_HAVE_FPU
|
||||
select ARCH_HAVE_TESTSET
|
||||
select ARM_HAVE_NEON
|
||||
|
||||
config ARCH_CORTEX_R82
|
||||
bool
|
||||
|
@ -168,7 +168,9 @@ config ARCH_CORTEX_R82
|
|||
select ARCH_DCACHE
|
||||
select ARCH_ICACHE
|
||||
select ARCH_HAVE_MPU
|
||||
select ARCH_HAVE_FPU
|
||||
select ARCH_HAVE_TESTSET
|
||||
select ARM_HAVE_NEON
|
||||
|
||||
config ARCH_FAMILY
|
||||
string
|
||||
|
|
|
@ -24,14 +24,20 @@
|
|||
|
||||
#include <nuttx/config.h>
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <debug.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <nuttx/sched.h>
|
||||
#include <nuttx/arch.h>
|
||||
#include <nuttx/fs/procfs.h>
|
||||
#include <arch/irq.h>
|
||||
|
||||
#include "sched/sched.h"
|
||||
|
@ -46,6 +52,26 @@
|
|||
***************************************************************************/
|
||||
|
||||
#define FPU_CALLEE_REGS (8)
|
||||
#define FPU_PROC_LINELEN (64 * CONFIG_SMP_NCPUS)
|
||||
|
||||
/***************************************************************************
|
||||
* Private Types
|
||||
***************************************************************************/
|
||||
|
||||
/* This structure describes one open "file" */
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
|
||||
struct arm64_fpu_procfs_file_s
|
||||
{
|
||||
struct procfs_file_s base; /* Base open file structure */
|
||||
unsigned int linesize; /* Number of valid characters in line[] */
|
||||
|
||||
/* Pre-allocated buffer for formatted lines */
|
||||
|
||||
char line[FPU_PROC_LINELEN];
|
||||
};
|
||||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* Private Data
|
||||
|
@ -54,6 +80,44 @@
|
|||
static struct fpu_reg g_idle_thread_fpu[CONFIG_SMP_NCPUS];
|
||||
static struct arm64_cpu_fpu_context g_cpu_fpu_ctx[CONFIG_SMP_NCPUS];
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
|
||||
/* procfs methods */
|
||||
|
||||
static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
|
||||
int oflags, mode_t mode);
|
||||
static int arm64_fpu_procfs_close(struct file *filep);
|
||||
static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
|
||||
size_t buflen);
|
||||
static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf);
|
||||
|
||||
/* See include/nuttx/fs/procfs.h
|
||||
* We use the old-fashioned kind of initializers so that this will compile
|
||||
* with any compiler.
|
||||
*/
|
||||
|
||||
const struct procfs_operations arm64_fpu_procfs_operations =
|
||||
{
|
||||
arm64_fpu_procfs_open, /* open */
|
||||
arm64_fpu_procfs_close, /* close */
|
||||
arm64_fpu_procfs_read, /* read */
|
||||
NULL, /* write */
|
||||
NULL, /* dup */
|
||||
NULL, /* opendir */
|
||||
NULL, /* closedir */
|
||||
NULL, /* readdir */
|
||||
NULL, /* rewinddir */
|
||||
arm64_fpu_procfs_stat /* stat */
|
||||
};
|
||||
|
||||
static const struct procfs_entry_s g_procfs_arm64_fpu =
|
||||
{
|
||||
"fpu",
|
||||
&arm64_fpu_procfs_operations
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* Private Functions
|
||||
***************************************************************************/
|
||||
|
@ -84,6 +148,120 @@ static void arm64_fpu_access_trap_disable(void)
|
|||
ARM64_ISB();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
|
||||
static int arm64_fpu_procfs_open(struct file *filep, const char *relpath,
|
||||
int oflags, mode_t mode)
|
||||
{
|
||||
struct arm64_fpu_procfs_file_s *priv;
|
||||
|
||||
uinfo("Open '%s'\n", relpath);
|
||||
|
||||
/* PROCFS is read-only. Any attempt to open with any kind of write
|
||||
* access is not permitted.
|
||||
*
|
||||
* REVISIT: Write-able proc files could be quite useful.
|
||||
*/
|
||||
|
||||
if (((oflags & O_WRONLY) != 0 || (oflags & O_RDONLY) == 0))
|
||||
{
|
||||
uerr("ERROR: Only O_RDONLY supported\n");
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
/* Allocate the open file structure */
|
||||
|
||||
priv = (struct arm64_fpu_procfs_file_s *)kmm_zalloc(
|
||||
sizeof(struct arm64_fpu_procfs_file_s));
|
||||
if (priv == NULL)
|
||||
{
|
||||
uerr("ERROR: Failed to allocate file attributes\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* Save the open file structure as the open-specific state in
|
||||
* filep->f_priv.
|
||||
*/
|
||||
|
||||
filep->f_priv = (void *)priv;
|
||||
return OK;
|
||||
}
|
||||
|
||||
static int arm64_fpu_procfs_close(struct file *filep)
|
||||
{
|
||||
struct arm64_fpu_procfs_file_s *priv;
|
||||
|
||||
/* Recover our private data from the struct file instance */
|
||||
|
||||
priv = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
|
||||
DEBUGASSERT(priv);
|
||||
|
||||
/* Release the file attributes structure */
|
||||
|
||||
kmm_free(priv);
|
||||
filep->f_priv = NULL;
|
||||
return OK;
|
||||
}
|
||||
|
||||
static ssize_t arm64_fpu_procfs_read(struct file *filep, char *buffer,
|
||||
size_t buflen)
|
||||
{
|
||||
struct arm64_fpu_procfs_file_s *attr;
|
||||
struct arm64_cpu_fpu_context *ctx;
|
||||
off_t offset;
|
||||
int linesize;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
uinfo("buffer=%p buflen=%zu\n", buffer, buflen);
|
||||
|
||||
/* Recover our private data from the struct file instance */
|
||||
|
||||
attr = (struct arm64_fpu_procfs_file_s *)filep->f_priv;
|
||||
DEBUGASSERT(attr);
|
||||
|
||||
/* Traverse all FPU context */
|
||||
|
||||
linesize = 0;
|
||||
for (i = 0; i < CONFIG_SMP_NCPUS; i++)
|
||||
{
|
||||
ctx = &g_cpu_fpu_ctx[i];
|
||||
linesize += snprintf(attr->line + linesize,
|
||||
FPU_PROC_LINELEN,
|
||||
"CPU%d: save: %d restore: %d "
|
||||
"switch: %d exedepth: %d\n",
|
||||
i, ctx->save_count, ctx->restore_count,
|
||||
ctx->switch_count, ctx->exe_depth_count);
|
||||
}
|
||||
|
||||
attr->linesize = linesize;
|
||||
|
||||
/* Transfer the system up time to user receive buffer */
|
||||
|
||||
offset = filep->f_pos;
|
||||
ret = procfs_memcpy(attr->line, attr->linesize,
|
||||
buffer, buflen, &offset);
|
||||
|
||||
/* Update the file offset */
|
||||
|
||||
if (ret > 0)
|
||||
{
|
||||
filep->f_pos += ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int arm64_fpu_procfs_stat(const char *relpath, struct stat *buf)
|
||||
{
|
||||
buf->st_mode = S_IFREG | S_IROTH | S_IRGRP | S_IRUSR;
|
||||
buf->st_size = 0;
|
||||
buf->st_blksize = 0;
|
||||
buf->st_blocks = 0;
|
||||
return OK;
|
||||
}
|
||||
#endif
|
||||
|
||||
/***************************************************************************
|
||||
* Public Functions
|
||||
***************************************************************************/
|
||||
|
@ -258,3 +436,18 @@ bool up_fpucmp(const void *saveregs1, const void *saveregs2)
|
|||
return memcmp(®s1[FPU_REG_Q4], ®s2[FPU_REG_Q4],
|
||||
8 * FPU_CALLEE_REGS) == 0;
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
* Name: arm64_fpu_procfs_register
|
||||
*
|
||||
* Description:
|
||||
* Register the arm64 fpu procfs file system entry
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
int arm64_fpu_procfs_register(void)
|
||||
{
|
||||
return procfs_register(&g_procfs_arm64_fpu);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -63,6 +63,10 @@ struct arm64_cpu_fpu_context
|
|||
void arm64_init_fpu(struct tcb_s *tcb);
|
||||
void arm64_destory_fpu(struct tcb_s *tcb);
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
int arm64_fpu_procfs_register(void);
|
||||
#endif
|
||||
|
||||
void arm64_fpu_disable(void);
|
||||
void arm64_fpu_enable(void);
|
||||
|
||||
|
|
|
@ -218,5 +218,10 @@ void up_initialize(void)
|
|||
g_fpu_panic_block.notifier_call = arm64_panic_disable_fpu;
|
||||
g_fpu_panic_block.priority = INT_MAX;
|
||||
panic_notifier_chain_register(&g_fpu_panic_block);
|
||||
|
||||
#ifdef CONFIG_FS_PROCFS_REGISTER
|
||||
arm64_fpu_procfs_register();
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -294,7 +294,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
|
|||
|
||||
tcb->xcp.saved_reg = tcb->xcp.regs;
|
||||
#ifdef CONFIG_ARCH_FPU
|
||||
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs;
|
||||
tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
|
||||
#endif
|
||||
arm64_init_signal_process(tcb);
|
||||
|
||||
|
@ -341,7 +341,7 @@ void up_schedule_sigaction(struct tcb_s *tcb, sig_deliver_t sigdeliver)
|
|||
tcb->xcp.sigdeliver = sigdeliver;
|
||||
|
||||
#ifdef CONFIG_ARCH_FPU
|
||||
tcb->xcp.sig_save_fpu_regs = tcb->xcp.fpu_regs;
|
||||
tcb->xcp.saved_fpu_regs = tcb->xcp.fpu_regs;
|
||||
#endif
|
||||
tcb->xcp.saved_reg = tcb->xcp.regs;
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
|
|||
CONFIG_EXAMPLES_HELLO=y
|
||||
CONFIG_EXPERIMENTAL=y
|
||||
CONFIG_FS_PROCFS=y
|
||||
CONFIG_FS_PROCFS_REGISTER=y
|
||||
CONFIG_FS_ROMFS=y
|
||||
CONFIG_FVP_UART_PL011=y
|
||||
CONFIG_IDLETHREAD_STACKSIZE=8192
|
||||
|
|
|
@ -30,6 +30,7 @@ CONFIG_DEV_ZERO=y
|
|||
CONFIG_EXAMPLES_HELLO=y
|
||||
CONFIG_EXPERIMENTAL=y
|
||||
CONFIG_FS_PROCFS=y
|
||||
CONFIG_FS_PROCFS_REGISTER=y
|
||||
CONFIG_FS_ROMFS=y
|
||||
CONFIG_FVP_UART_PL011=y
|
||||
CONFIG_IDLETHREAD_STACKSIZE=8192
|
||||
|
|
|
@ -239,17 +239,7 @@ need to be considered:
|
|||
In many cases, the FPU trap is triggered by va_start() that copies
|
||||
the content of FP registers used for floating point argument passing
|
||||
into the va_list object in case there were actual float arguments from
|
||||
the caller. But In practice this is almost never the case.
|
||||
Seeing the save_count/restore_count at the g_cpu_fpu_ctx, which will
|
||||
be increase when saving/restoring FPU context. After running ostest,
|
||||
we can see the count with GDB:
|
||||
|
||||
(gdb) p g_cpu_fpu_ctx
|
||||
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>,
|
||||
save_count = 1293, restore_count = 2226, switch_count = 4713,
|
||||
exe_depth_count = 0}}
|
||||
(gdb)
|
||||
|
||||
the caller.
|
||||
adding -mgeneral-regs-only option will make compiler not use the FPU
|
||||
register, we can use the following patch to syslog:
|
||||
|
||||
|
@ -262,24 +252,33 @@ index c58fb45512..acac6febaa
|
|||
DEPPATH += --dep-path syslog
|
||||
VPATH += :syslog
|
||||
+syslog/lib_syslog.c_CFLAGS += -mgeneral-regs-only
|
||||
|
||||
With the option to make NuttX and booting. After running ostest, see
|
||||
the count with GDB again:
|
||||
|
||||
(gdb) p g_cpu_fpu_ctx
|
||||
$1 = {{fpu_owner = 0x0, idle_thread = 0x402b3110 <g_idletcb>, save_count = 141,
|
||||
restore_count = 170, switch_count = 4715, exe_depth_count = 0}}
|
||||
(gdb)
|
||||
|
||||
it's only 141/170 for saving/restoring FPU context, which is 1293/2226 before
|
||||
add this compile option. Almost all of FPU accessing switch is argument passing
|
||||
at the syslog.
|
||||
I cannot commit the patch for NuttX mainline because it's very special case
|
||||
I cannot commit the patch for NuttX mainline because it's very special case
|
||||
since ostest is using syslog for lots of information printing. but this is
|
||||
a clue for FPU performance analysis. va_list object is using for many C code to
|
||||
handle argument passing, but if it's not passing floating point argument indeed.
|
||||
Add the option to your code maybe increase FPU performance
|
||||
|
||||
2. memset/memcpy issue
|
||||
To improve performance, the libc implementations of memset/memcpy
use NEON/FPU instructions and registers, so the FPU trap is also
triggered in this case.
|
||||
|
||||
we can trace this issue with Procfs:
|
||||
|
||||
nsh> cat /proc/fpu
|
||||
CPU0: save: 7 restore: 8 switch: 62 exedepth: 0
|
||||
nsh>
|
||||
|
||||
after ostest
|
||||
nsh> cat /proc/fpu
|
||||
CPU0: save: 1329 restore: 2262 switch: 4613 exedepth: 0
|
||||
nsh>
|
||||
|
||||
Note:
|
||||
save: the number of times a task FPU context has been saved
restore: the number of times a task FPU context has been restored
switch: the number of task switches
|
||||
|
||||
2. FPU trap at IRQ handler
|
||||
it's probably need to handle FPU trap at IRQ routine. Exception_depth is
|
||||
handling for this case, it will inc/dec at enter/leave exception. If the
|
||||
|
@ -295,6 +294,10 @@ save/restore FPU context directly maybe become a solution. Linux kernel introduc
|
|||
kernel_neon_begin/kernel_neon_end function for this case. Similar function will
|
||||
be add to NuttX if this issue need to be handle.
|
||||
|
||||
3. More reading
|
||||
for Linux kernel, please reference:
|
||||
- https://www.kernel.org/doc/html/latest/arm/kernel_mode_neon.html
|
||||
|
||||
SMP Support
|
||||
===========
|
||||
1. Booting
|
||||
|
|
|
@ -31,6 +31,7 @@ CONFIG_DEV_ZERO=y
|
|||
CONFIG_EXAMPLES_HELLO=y
|
||||
CONFIG_EXPERIMENTAL=y
|
||||
CONFIG_FS_PROCFS=y
|
||||
CONFIG_FS_PROCFS_REGISTER=y
|
||||
CONFIG_FS_ROMFS=y
|
||||
CONFIG_HAVE_CXX=y
|
||||
CONFIG_HAVE_CXXINITIALIZE=y
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
# You can then do "make savedefconfig" to generate a new defconfig file that includes your
|
||||
# modifications.
|
||||
#
|
||||
# CONFIG_ARCH_FPU is not set
|
||||
CONFIG_ARCH="arm64"
|
||||
CONFIG_ARCH_ARM64=y
|
||||
CONFIG_ARCH_BOARD="qemu-armv8a"
|
||||
|
@ -32,6 +31,7 @@ CONFIG_DEV_ZERO=y
|
|||
CONFIG_EXAMPLES_HELLO=y
|
||||
CONFIG_EXPERIMENTAL=y
|
||||
CONFIG_FS_PROCFS=y
|
||||
CONFIG_FS_PROCFS_REGISTER=y
|
||||
CONFIG_FS_ROMFS=y
|
||||
CONFIG_IDLETHREAD_STACKSIZE=16384
|
||||
CONFIG_INIT_ENTRYPOINT="nsh_main"
|
||||
|
|
Loading…
Reference in a new issue