sched/gprof: add gprof support

gprof can analyze code hot spots based on scheduled sampling.
After adding the "-pg" parameter when compiling, you can view the code call graph.

Signed-off-by: yinshengkai <yinshengkai@xiaomi.com>
This commit is contained in:
yinshengkai 2024-03-12 18:03:39 +08:00 committed by Xiang Xiao
parent 4944bfd56c
commit eb8449cb0c
14 changed files with 742 additions and 2 deletions

View file

@ -167,6 +167,10 @@ if(CONFIG_ARCH_INSTRUMENT_ALL)
add_compile_options(-finstrument-functions)
endif()
if(CONFIG_SCHED_GPROF_ALL)
add_compile_options(-pg)
endif()
if(CONFIG_UNWINDER_ARM)
add_compile_options(-funwind-tables -fasynchronous-unwind-tables)
endif()

View file

@ -72,6 +72,10 @@ ifneq ($(CONFIG_STACK_USAGE_WARNING),0)
ARCHOPTIMIZATION += -Wstack-usage=$(CONFIG_STACK_USAGE_WARNING)
endif
ifeq ($(CONFIG_SCHED_GPROF_ALL),y)
ARCHOPTIMIZATION += -pg
endif
ifeq ($(CONFIG_MM_UBSAN_ALL),y)
ARCHOPTIMIZATION += $(CONFIG_MM_UBSAN_OPTION)
endif

View file

@ -97,6 +97,13 @@ config SIM_GCOV_ALL
This option activates code coverage instrumentation for the
entire image.
config SIM_GPROF
bool "Enable gprof"
depends on !SCHED_GPROF
default n
---help---
Enable support for the gprof profiling tool.
choice
prompt "X64_64 ABI"
default SIM_X8664_SYSTEMV if HOST_LINUX

View file

@ -144,6 +144,10 @@ ifeq ($(CONFIG_SCHED_GCOV),y)
STDLIBS += -lgcov
endif
ifeq ($(CONFIG_SIM_GPROF),y)
HOSTCFLAGS += -pg
endif
ifeq ($(CONFIG_STACK_COLORATION),y)
CSRCS += sim_checkstack.c
endif
@ -423,7 +427,8 @@ nuttx$(EXEEXT): libarch$(LIBEXT) board/libboard$(LIBEXT) $(HEADOBJ) $(LINKOBJS)
ifneq ($(CONFIG_HOST_MACOS),y)
$(Q) $(OBJCOPY) --redefine-syms=nuttx-names.dat nuttx.rel
$(Q) $(CC) $(CFLAGS) -Wl,-verbose 2>&1 | \
sed -e '/====/,/====/!d;//d' -e 's/__executable_start/_stext/g' \
sed -e '/====/,/====/!d;//d' \
-e '/__executable_start/s/$$/PROVIDE(_stext = .);/' \
-e 's/^\(\s\+\)\(\.init_array\)/\1\2 : { }\n\1.sinit/g' \
-e 's/^\(\s\+\)\(\.fini_array\)/\1\2 : { }\n\1.einit/g' \
-e 's/__init_array_start/_sinit/g' -e 's/__init_array_end/_einit/g' \

View file

@ -76,6 +76,10 @@ ifeq ($(CONFIG_SIM_GCOV_ALL),y)
ARCHOPTIMIZATION += -fprofile-generate -ftest-coverage
endif
ifneq ($(CONFIG_SCHED_GPROF_ALL)$(CONFIG_SIM_GPROF),)
ARCHOPTIMIZATION += -pg
endif
ifeq ($(CONFIG_SIM_ASAN),y)
ARCHOPTIMIZATION += -fsanitize=address -fsanitize-address-use-after-scope
ARCHOPTIMIZATION += -fsanitize=pointer-compare -fsanitize=pointer-subtract

54
include/sys/gmon.h Normal file
View file

@ -0,0 +1,54 @@
/****************************************************************************
* include/sys/gmon.h
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
#ifndef __INCLUDE_SYS_GMON_H
#define __INCLUDE_SYS_GMON_H
/****************************************************************************
 * Public Function Prototypes
 ****************************************************************************/
/* EXTERN is defined and undefined again around the prototypes; the C++
 * linkage itself comes from the extern "C" block opened below.
 */
#undef EXTERN
#if defined(__cplusplus)
#define EXTERN extern "C"
extern "C"
{
#else
#define EXTERN extern
#endif
/* Start (mode != 0) or stop (mode == 0) profiling.
 * A call that requests the state profiling is already in returns
 * without doing anything.
 */
void moncontrol(int mode);
/* Set up the call graph data structures for the text range
 * lowpc..highpc (both are rounded outwards to the histogram granularity)
 * and start profiling.
 * Returns without doing anything if the tables already exist, if the OS
 * is not ready yet, or if called from interrupt context.
 */
void monstartup(unsigned long lowpc, unsigned long highpc);
/* Stop profiling, write the collected data to a file and release the
 * profiling buffers.  The file name is taken from the GMON_OUT_PREFIX
 * environment variable when the environment is enabled and the variable
 * is set; otherwise "gmon.out" is used.
 */
void _mcleanup(void);
#undef EXTERN
#if defined(__cplusplus)
}
#endif
#endif /* __INCLUDE_SYS_GMON_H */

View file

@ -62,4 +62,8 @@ if(CONFIG_ARCH_SETJMP_H)
endif()
endif()
if(CONFIG_SCHED_GPROF)
list(APPEND SRCS gnu/mcount.S)
endif()
target_sources(c PRIVATE ${SRCS})

View file

@ -60,6 +60,10 @@ ASRCS += arch_setjmp.S
endif
endif
ifeq ($(CONFIG_SCHED_GPROF),y)
ASRCS += mcount.S
endif
ifeq ($(CONFIG_ARCH_TOOLCHAIN_GNU),y)
DEPPATH += --dep-path machine/arm/gnu
VPATH += :machine/arm/gnu

View file

@ -0,0 +1,41 @@
/****************************************************************************
* libs/libc/machine/arm/gnu/mcount.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Public Functions
****************************************************************************/
/* __gnu_mcount_nc
 *
 * Profiling hook.  Records one call graph arc by calling
 * mcount_internal(frompc, selfpc) and then resumes execution at the
 * address that was in lr on entry.
 *
 * The word found on top of the stack on entry is used as the caller
 * address (frompc); it is removed from the stack and loaded into lr
 * before returning.
 * NOTE(review): presumably the compiler generated prologue of a function
 * built with -pg pushes lr immediately before calling this hook - confirm
 * against the GCC ARM -pg calling convention.
 */
.globl __gnu_mcount_nc
.syntax unified
.file "mcount.S"
.type __gnu_mcount_nc, %function
__gnu_mcount_nc:
push {r0, r1, r2, r3, lr} /* Save r0-r3 and lr: 5 words = 20 bytes */
bic r1, lr, #1 /* R1 (selfpc) = lr on entry, with thumb bit cleared */
ldr r0, [sp, #20] /* R0 (frompc) = word that was on top of the stack on
                   * entry; it now sits just above the 20 bytes pushed
                   */
bic r0, r0, #1 /* Clear thumb bit */
bl mcount_internal /* Call mcount_internal(frompc = r0, selfpc = r1) */
pop {r0, r1, r2, r3, ip, lr} /* Restore r0-r3; ip = lr saved on entry;
                              * lr = the extra word found on the stack
                              * on entry (6 words popped, 5 pushed)
                              */
bx ip /* Resume at the address that was in lr on entry */
.size __gnu_mcount_nc, .-__gnu_mcount_nc
.end

View file

@ -1357,6 +1357,27 @@ config SCHED_GCOV
"-fprofile-generate -ftest-coverage" compilation parameters
to the file to be analyzed.
config SCHED_GPROF
bool "Enable gprof profiling"
default n
---help---
Enable gprof profiling support. This builds the profiling
runtime (sample histogram and call graph arc recording) into
the image. The collected data is written to a gmon.out file
at run time when _mcleanup() is called.
Add the "-pg" parameter to the Makefile of a module when compiling
to obtain the function call graph of that module.
config SCHED_GPROF_ALL
bool "Enable gprof call graph for all modules"
depends on SCHED_GPROF
default n
---help---
Enable gprof profiling for all code. Every module is compiled
with '-pg', which causes a large performance penalty.
To analyze only specific modules, leave this option disabled and
add the '-pg' parameter to those modules in their makefiles.
endmenu
menu "Files and I/O"

View file

@ -26,4 +26,8 @@ if(NOT "${CONFIG_SCHED_STACK_RECORD}" STREQUAL "0")
list(APPEND SRCS stack_record.c)
endif()
if(CONFIG_SCHED_GPROF)
list(APPEND SRCS profile_monitor.c)
endif()
target_sources(sched PRIVATE ${SRCS})

View file

@ -26,6 +26,10 @@ ifneq ($(CONFIG_SCHED_STACK_RECORD),0)
CSRCS += stack_record.c
endif
ifeq ($(CONFIG_SCHED_GPROF),y)
CSRCS += profile_monitor.c
endif
# Include instrument build support
DEPPATH += --dep-path instrument

View file

@ -0,0 +1,584 @@
/****************************************************************************
* sched/instrument/profile_monitor.c
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include <debug.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/gmon.h>
#include <nuttx/arch.h>
#include <nuttx/init.h>
#include <nuttx/kmalloc.h>
#include <nuttx/fs/fs.h>
#include <nuttx/spinlock.h>
/****************************************************************************
* Pre-processor Definitions
****************************************************************************/
#define GMONVERSION 0x00051879
/* Histogram counters are unsigned shorts (according to the kernel). */
#define HISTCOUNTER unsigned short
/* Fraction of text space to allocate for histogram counters here, 1/2 */
#define HISTFRACTION 2
/* Fraction of text space to allocate for from hash buckets.
* The value of HASHFRACTION is based on the minimum number of bytes
* of separation between two subroutine call points in the object code.
* Given MIN_SUBR_SEPARATION bytes of separation the value of
* HASHFRACTION is calculated as:
*
* HASHFRACTION = MIN_SUBR_SEPARATION / (2 * sizeof(short) - 1);
*
* For example, on the VAX, the shortest two call sequence is:
*
* calls $0,(r0)
* calls $0,(r0)
*
* Which is separated by only three bytes, thus HASHFRACTION is
* calculated as:
*
* HASHFRACTION = 3 / (2 * 2 - 1) = 1
*
* Note that the division above rounds down, thus if
* MIN_SUBR_SEPARATION is less than three, this algorithm will not work!
*
* In practice, however, call instructions are rarely at a minimal
* distance. Hence, we will define HASHFRACTION to be 2 across all
* architectures. This saves a reasonable amount of space for
* profiling data structures without (in practice) sacrificing
* any granularity.
*/
#define HASHFRACTION 2
/* Percent of text space to allocate for tostructs with a minimum.
* This is a heuristic; we will fail with a warning when profiling
* programs with a very large number of very small functions, but
* that's normally OK.
* 2 is probably still a good value for normal programs.
* Profiling a test case with 64000 small functions will work if
* you raise this value to 3 and link statically (which bloats the
* text size, thus raising the number of arcs expected by the heuristic).
*/
#define ARCDENSITY 3
/* Always allocate at least this many tostructs. This
* hides the inadequacy of the ARCDENSITY heuristic, at least
* for small programs.
*/
#define MINARCS 50
/* The type used to represent indices into gmonparam.tos[]. */
#define ARCINDEX unsigned long
/* Maximum number of arcs we want to allow.
* Used to be max representable value of ARCINDEX minus 2, but now
* that ARCINDEX is a long, that's too large; we don't really want
* to allow a 48 gigabyte table.
*/
#define MAXARCS (1 << 20)
/* General rounding functions. */
#define ROUNDDOWN(x, y) (((x) / (y)) * (y))
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))
/* See profil(2) where this is described (incorrectly) */
#define SCALE_1_TO_1 0x10000
/****************************************************************************
* Private Types
****************************************************************************/
/* One call graph arc destination.  Entries live in gmonparam.tos[] and are
 * chained through 'link'; the head of each chain is stored in
 * gmonparam.froms[], indexed by the (hashed) caller address.
 */
struct tostruct
{
uintptr_t selfpc; /* Callee address/program counter. The caller address
* is not stored here: it is implied by the froms[] slot whose chain
* contains this entry
*/
long count; /* How many times this arc has been traversed */
ARCINDEX link; /* Index of the next tos[] entry on the same chain, 0 at
* the end of the chain. For tos[0] this holds the index of the last
* allocated entry instead
*/
};
/* Structure prepended to gmon.out profiling data file.
 * It is followed in the file by the histogram samples and then by the
 * raw arc records.
 */
struct gmonhdr
{
uintptr_t lpc; /* Base pc address of sample buffer */
uintptr_t hpc; /* Max pc address of sampled buffer */
uint32_t ncnt; /* Size of sample buffer in bytes, plus the size of this
* header
*/
uint32_t version; /* Version number (GMONVERSION) */
uint32_t profrate; /* Profiling clock rate, in samples per second */
uint32_t spare[3]; /* Reserved */
};
/* A raw arc, with pointers to the calling site and
 * the called site and a count.
 * One such record is written to the output file for every tos[] entry
 * reachable from froms[].
 */
struct rawarc
{
uintptr_t raw_frompc; /* Caller address, rebuilt from the froms[] index */
uintptr_t raw_selfpc; /* Callee address (tostruct.selfpc) */
long raw_count; /* Number of traversals (tostruct.count) */
};
/* The profiling data structures are housed in this structure.
 * tos[] and froms[] share one allocation: tos[] comes first and froms[]
 * starts tossize bytes after it, so only 'tos' is passed to kmm_free().
 */
struct gmonparam
{
bool running; /* True while profiling is active; set and cleared by
* moncontrol() and tested by mcount_internal()
*/
FAR unsigned short *kcount; /* Histogram PC sample array */
size_t kcountsize; /* Size of kcount[] array in bytes */
FAR ARCINDEX *froms; /* Array of hashed 'from' addresses. Each element
* is an ARCINDEX index into the tos[] array; 0 means that no arc has
* been recorded for that slot
*/
size_t fromssize; /* Size of froms[] array in bytes */
FAR struct tostruct *tos; /* Arc records: callee address, call count and
* chain link
*/
size_t tossize; /* Size of tos[] array in bytes */
size_t tolimit; /* Number of entries in tos[] */
uintptr_t lowpc; /* Low program counter of area */
uintptr_t highpc; /* High program counter */
size_t textsize; /* Code size (highpc - lowpc) */
spinlock_t lock; /* Lock for this structure */
};
/****************************************************************************
* Private Data
****************************************************************************/
static struct gmonparam g_monparam;
/****************************************************************************
* Public Data
****************************************************************************/
extern uint8_t _stext[];
extern uint8_t _etext[];
/****************************************************************************
* Private Functions
****************************************************************************/
/****************************************************************************
 * Name: write_gmon
 *
 * Description:
 *   Write the collected profiling data to a file: the gmon header, the
 *   histogram samples (kcount[]) and one raw arc record for every tos[]
 *   entry reachable from froms[].
 *
 * Input Parameters:
 *   p      - Profiling state to dump
 *   output - Path of the file to create (truncated if it already exists)
 *
 * Returned Value:
 *   0 on success; a negated errno value on failure.  A short write is
 *   reported as -EIO.
 *
 ****************************************************************************/

noinstrument_function
static int write_gmon(FAR struct gmonparam *p, FAR const char *output)
{
  struct gmonhdr gmonhdr;
  struct rawarc rawarc;
  struct file file;
  uintptr_t frompc;
  ARCINDEX toindex;
  size_t fromindex;
  size_t endfrom;
  int closeret;
  int ret;

  ret = file_open(&file, output, O_CREAT | O_TRUNC | O_WRONLY, 0666);
  if (ret < 0)
    {
      serr("cannot open %s\n", output);
      return ret;
    }

  /* Clear the whole header first so that the reserved words (and any
   * padding) are written as zeros instead of leaking stack contents.
   */

  memset(&gmonhdr, 0, sizeof(gmonhdr));
  gmonhdr.lpc = p->lowpc;
  gmonhdr.hpc = p->highpc;
  gmonhdr.ncnt = sizeof(gmonhdr) + p->kcountsize;
  gmonhdr.version = GMONVERSION;
  gmonhdr.profrate = CONFIG_SCHED_PROFILE_TICKSPERSEC;

  ret = file_write(&file, &gmonhdr, sizeof(gmonhdr));
  if (ret < 0 || (size_t)ret != sizeof(gmonhdr))
    {
      serr("write gmonhdr failed\n");
      ret = ret < 0 ? ret : -EIO;
      goto out;
    }

  ret = file_write(&file, p->kcount, p->kcountsize);
  if (ret < 0 || (size_t)ret != p->kcountsize)
    {
      serr("write kcount failed\n");
      ret = ret < 0 ? ret : -EIO;
      goto out;
    }

  endfrom = p->fromssize / sizeof(*p->froms);
  for (fromindex = 0; fromindex < endfrom; fromindex++)
    {
      if (p->froms[fromindex] == 0)
        {
          continue;
        }

      /* Undo the hashing done by mcount_internal() to get the caller
       * address back from the froms[] index.
       */

      frompc = p->lowpc;
      frompc += fromindex * HASHFRACTION * sizeof(*p->froms);

      for (toindex = p->froms[fromindex]; toindex != 0;
           toindex = p->tos[toindex].link)
        {
          rawarc.raw_frompc = frompc;
          rawarc.raw_selfpc = p->tos[toindex].selfpc;
          rawarc.raw_count = p->tos[toindex].count;

          ret = file_write(&file, &rawarc, sizeof(rawarc));
          if (ret < 0 || (size_t)ret != sizeof(rawarc))
            {
              serr("write rawarc failed\n");
              ret = ret < 0 ? ret : -EIO;
              goto out;
            }
        }
    }

  ret = 0;

out:

  /* Report a close failure only if nothing went wrong before it */

  closeret = file_close(&file);
  if (ret >= 0 && closeret < 0)
    {
      ret = closeret;
    }

  return ret;
}
/****************************************************************************
* Public Functions
****************************************************************************/
/* Control profiling
* profiling is what mcount checks to see if
* all the data structures are ready.
*/
noinstrument_function
void moncontrol(int mode)
{
FAR struct gmonparam *p = &g_monparam;
irqstate_t flags;
if (p->running == !!mode)
{
return;
}
if (mode)
{
uintptr_t lowpc = ROUNDDOWN((uintptr_t)&_stext,
HISTFRACTION * sizeof(HISTCOUNTER));
uintptr_t highpc = ROUNDUP((uintptr_t)&_etext,
HISTFRACTION * sizeof(HISTCOUNTER));
size_t textsize = highpc - lowpc;
size_t kcountsize = ROUNDUP(textsize / HISTFRACTION,
sizeof(*p->kcount));
int scale = kcountsize >= textsize ? SCALE_1_TO_1 :
(float)kcountsize / textsize * SCALE_1_TO_1;
FAR unsigned short *kcount = kmm_zalloc(kcountsize);
if (kcount == NULL)
{
serr("out of memory\n");
return;
}
flags = spin_lock_irqsave(&p->lock);
if (p->kcount)
{
spin_unlock_irqrestore(&p->lock, flags);
kmm_free(kcount);
return;
}
p->running = true;
p->lowpc = lowpc;
p->highpc = highpc;
p->textsize = textsize;
p->kcount = kcount;
p->kcountsize = kcountsize;
spin_unlock_irqrestore(&p->lock, flags);
profil(kcount, kcountsize, lowpc, scale);
}
else
{
bool running;
flags = spin_lock_irqsave(&p->lock);
running = p->running;
p->running = false;
spin_unlock_irqrestore(&p->lock, flags);
if (running)
{
profil(NULL, 0, 0, 0);
}
}
}
noinstrument_function
void monstartup(unsigned long lowpc, unsigned long highpc)
{
FAR struct gmonparam *p = &g_monparam;
irqstate_t flags;
FAR char *buffer;
size_t textsize;
size_t fromssize;
size_t tolimit;
size_t tossize;
/* If we are incorrectly called twice in a row (without an
* intervening call to _mcleanup), ignore the second call to
* prevent leaking memory.
*/
if (p->tos != NULL)
{
return;
}
/* Return if the allocation doesn't allow in the current context */
if (!OSINIT_OS_READY() || up_interrupt_context())
{
return;
}
/* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
lowpc = ROUNDDOWN(lowpc, HISTFRACTION * sizeof(HISTCOUNTER));
highpc = ROUNDUP(highpc, HISTFRACTION * sizeof(HISTCOUNTER));
textsize = highpc - lowpc;
fromssize = ROUNDUP(textsize / HASHFRACTION, sizeof(*p->froms));
tolimit = textsize * ARCDENSITY / 100;
if (tolimit < MINARCS)
{
tolimit = MINARCS;
}
else if (tolimit > MAXARCS)
{
tolimit = MAXARCS;
}
tossize = tolimit * sizeof(struct tostruct);
buffer = kmm_zalloc(fromssize + tossize);
if (buffer == NULL)
{
serr("out of memory\n");
return;
}
flags = spin_lock_irqsave(&p->lock);
if (p->tos != NULL)
{
spin_unlock_irqrestore(&p->lock, flags);
kmm_free(buffer);
return;
}
p->lowpc = lowpc;
p->highpc = highpc;
p->textsize = textsize;
p->fromssize = fromssize;
p->tolimit = tolimit;
p->tossize = tossize;
p->tos = (FAR struct tostruct *)buffer;
buffer += p->tossize;
p->froms = (FAR ARCINDEX *)buffer;
spin_unlock_irqrestore(&p->lock, flags);
moncontrol(1);
}
noinstrument_function
void _mcleanup(void)
{
FAR struct gmonparam *p = &g_monparam;
FAR const char *prefix = NULL;
#ifndef CONFIG_DISABLE_ENVIRON
prefix = getenv("GMON_OUT_PREFIX");
#endif
if (prefix == NULL)
{
prefix = "gmon.out";
}
moncontrol(0);
if (p->kcount)
{
write_gmon(p, prefix);
}
kmm_free(p->tos);
kmm_free(p->kcount);
/* Reset buffer to initial state for safety */
memset(p, 0, sizeof(*p));
}
/****************************************************************************
 * Name: mcount_internal
 *
 * Description:
 *   Called on entry to each function compiled with the profiling switch
 *   set, by an assembly stub in:
 *     libs/libc/machine/xxx/mcount.S
 *   It updates the data structures that represent traversals of the
 *   program's call graph edges.
 *
 *   froms[] is indexed by the hashed caller address and holds the index of
 *   the first tos[] entry of a chain; each chain entry describes one
 *   callee reached from that caller slot.  The most recently used entry is
 *   kept at the head of its chain.
 *
 * Input Parameters:
 *   frompc - Return address in the caller (source of the arc)
 *   selfpc - Address in the called function (destination of the arc)
 *
 ****************************************************************************/

noinstrument_function
void mcount_internal(uintptr_t frompc, uintptr_t selfpc)
{
  FAR struct gmonparam *p = &g_monparam;
  FAR struct tostruct *prevtop;
  FAR struct tostruct *top;
  FAR ARCINDEX *frompcindex;
  ARCINDEX toindex;
  irqstate_t flags;

  /* Check that we are profiling */

  if (!p->running)
    {
      return;
    }

  /* Initialize the internal structure if not yet */

  monstartup((uintptr_t)&_stext, (uintptr_t)&_etext);

  flags = spin_lock_irqsave(&p->lock);

  /* Try next time if fail to initialize for some reason */

  if (p->tos == NULL)
    {
      goto done;
    }

  /* Check that frompc is a reasonable pc value.
   * For example: signal catchers get called from the stack,
   * not from text space. Too bad.
   *
   * An offset equal to textsize is rejected as well: it can map to the
   * element one past the end of froms[].
   */

  frompc -= p->lowpc;
  if (frompc >= p->textsize)
    {
      goto done;
    }

  frompcindex = &p->froms[frompc / (HASHFRACTION * sizeof(*p->froms))];
  toindex = *frompcindex; /* Get froms[] value */

  if (toindex == 0)
    {
      /* First time traversing this arc.  The link of tos[0] holds the
       * index of the last used record; test before incrementing so that
       * the counter stops growing once the table is full.
       */

      if (p->tos[0].link + 1 >= p->tolimit)
        {
          /* More tos[] entries than we can handle! */

          goto done;
        }

      toindex = ++p->tos[0].link;

      /* Store new 'to' value into froms[] */

      *frompcindex = toindex;
      top = &p->tos[toindex];
      top->selfpc = selfpc;
      top->count = 1;
      top->link = 0;
      goto done;
    }

  top = &p->tos[toindex];
  if (top->selfpc == selfpc)
    {
      /* Arc at front of chain; usual case. */

      top->count++;
      goto done;
    }

  /* Have to go looking down chain for it.
   * Top points to what we are looking at,
   * prevtop points to previous top.
   * We know it is not at the head of the chain.
   */

  for (; ; )
    {
      if (top->link == 0)
        {
          /* Top is end of the chain and none of the chain
           * had top->selfpc == selfpc.
           * So we allocate a new tostruct
           * and link it to the head of the chain.
           */

          if (p->tos[0].link + 1 >= p->tolimit)
            {
              goto done;
            }

          toindex = ++p->tos[0].link;
          top = &p->tos[toindex];
          top->selfpc = selfpc;
          top->count = 1;
          top->link = *frompcindex;
          *frompcindex = toindex;
          goto done;
        }

      /* Otherwise, check the next arc on the chain. */

      prevtop = top;
      top = &p->tos[top->link];
      if (top->selfpc == selfpc)
        {
          /* There it is.
           * Increment its count and
           * move it to the head of the chain.
           */

          top->count++;
          toindex = prevtop->link;
          prevtop->link = top->link;
          top->link = *frompcindex;
          *frompcindex = toindex;
          goto done;
        }
    }

done:
  spin_unlock_irqrestore(&p->lock, flags);
}

View file

@ -130,7 +130,7 @@ int profil(FAR unsigned short *buf, size_t bufsiz,
}
memset(buf, 0, bufsiz);
highpc = (uintmax_t)bufsiz * 32768 / scale;
highpc = (uintmax_t)bufsiz * 65536 / scale;
flags = spin_lock_irqsave(&prof->lock);
prof->counter = buf;