From: James Clark <james.clark@linaro•org>
To: Leo Yan <leo.yan@arm•com>
Cc: linux-arm-kernel@lists•infradead.org, coresight@lists•linaro.org,
linux-perf-users@vger•kernel.org,
Arnaldo Carvalho de Melo <acme@kernel•org>,
John Garry <john.g.garry@oracle•com>,
Will Deacon <will@kernel•org>, Mike Leach <mike.leach@arm•com>,
Suzuki K Poulose <suzuki.poulose@arm•com>,
Namhyung Kim <namhyung@kernel•org>,
Mark Rutland <mark.rutland@arm•com>,
Alexander Shishkin <alexander.shishkin@linux•intel.com>,
Jiri Olsa <jolsa@kernel•org>, Ian Rogers <irogers@google•com>,
Adrian Hunter <adrian.hunter@intel•com>,
Al Grant <al.grant@arm•com>,
Paschalis Mpeis <paschalis.mpeis@arm•com>,
Amir Ayupov <aaupov@fb•com>
Subject: Re: [PATCH v6 3/8] perf cs-etm: Use thread-stack for last branch entries
Date: Thu, 4 Jun 2026 15:09:26 +0100 [thread overview]
Message-ID: <9d1e0448-27d7-42d3-aaa3-2d09489f18d9@linaro.org> (raw)
In-Reply-To: <20260526-b4-arm_cs_callchain_support_v1-v6-3-f9f49f53c9dd@arm.com>
On 26/05/2026 5:59 pm, Leo Yan wrote:
> CS ETM maintains its own circular array for last branch entries, with
> local helpers to update, copy and reset the branch stack. This duplicates
> logic already provided by the common code.
>
> Record branch with thread_stack__event() and synthesize branch stack
> with thread_stack__br_sample(). This removes the local last_branch_rb
> buffer and position tracking. Keep the buffer number updated via
> thread_stack__set_trace_nr(), which is used when exporting samples to
> Python scripts.
>
> The output should remain same, except that be->flags.predicted is no
> longer set. Since CoreSight trace does not provide branch prediction
> information, clearing the flag avoids confusion.
>
> Signed-off-by: Leo Yan <leo.yan@arm•com>
> ---
> tools/perf/util/cs-etm.c | 152 +++++++++++++----------------------------------
> 1 file changed, 41 insertions(+), 111 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 5bff8811d61e423463b7bd4e20d599d5b5307a1a..398ab3b7a429d402cc8e5f6cccb35c0b7c253732 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -83,14 +83,13 @@ struct cs_etm_auxtrace {
> struct cs_etm_traceid_queue {
> u8 trace_chan_id;
> u64 period_instructions;
> - size_t last_branch_pos;
> union perf_event *event_buf;
> struct thread *thread;
> struct thread *prev_packet_thread;
> ocsd_ex_level prev_packet_el;
> ocsd_ex_level el;
> + unsigned int br_stack_sz;
> struct branch_stack *last_branch;
> - struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> struct cs_etm_packet *packet;
> struct cs_etm_packet_queue packet_queue;
> @@ -635,9 +634,8 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> tidq->last_branch = zalloc(sz);
> if (!tidq->last_branch)
> goto out_free;
> - tidq->last_branch_rb = zalloc(sz);
> - if (!tidq->last_branch_rb)
> - goto out_free;
> +
> + tidq->br_stack_sz = etm->synth_opts.last_branch_sz;
> }
>
> tidq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
> @@ -647,7 +645,6 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> return 0;
>
> out_free:
> - zfree(&tidq->last_branch_rb);
> zfree(&tidq->last_branch);
> zfree(&tidq->prev_packet);
> zfree(&tidq->packet);
> @@ -941,7 +938,6 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
> thread__zput(tidq->prev_packet_thread);
> zfree(&tidq->event_buf);
> zfree(&tidq->last_branch);
> - zfree(&tidq->last_branch_rb);
> zfree(&tidq->prev_packet);
> zfree(&tidq->packet);
> zfree(&tidq);
> @@ -1281,57 +1277,6 @@ static int cs_etm__queue_first_cs_timestamp(struct cs_etm_auxtrace *etm,
> return ret;
> }
>
> -static inline
> -void cs_etm__copy_last_branch_rb(struct cs_etm_queue *etmq,
> - struct cs_etm_traceid_queue *tidq)
> -{
> - struct branch_stack *bs_src = tidq->last_branch_rb;
> - struct branch_stack *bs_dst = tidq->last_branch;
> - size_t nr = 0;
> -
> - /*
> - * Set the number of records before early exit: ->nr is used to
> - * determine how many branches to copy from ->entries.
> - */
> - bs_dst->nr = bs_src->nr;
> -
> - /*
> - * Early exit when there is nothing to copy.
> - */
> - if (!bs_src->nr)
> - return;
> -
> - /*
> - * As bs_src->entries is a circular buffer, we need to copy from it in
> - * two steps. First, copy the branches from the most recently inserted
> - * branch ->last_branch_pos until the end of bs_src->entries buffer.
> - */
> - nr = etmq->etm->synth_opts.last_branch_sz - tidq->last_branch_pos;
> - memcpy(&bs_dst->entries[0],
> - &bs_src->entries[tidq->last_branch_pos],
> - sizeof(struct branch_entry) * nr);
> -
> - /*
> - * If we wrapped around at least once, the branches from the beginning
> - * of the bs_src->entries buffer and until the ->last_branch_pos element
> - * are older valid branches: copy them over. The total number of
> - * branches copied over will be equal to the number of branches asked by
> - * the user in last_branch_sz.
> - */
> - if (bs_src->nr >= etmq->etm->synth_opts.last_branch_sz) {
> - memcpy(&bs_dst->entries[nr],
> - &bs_src->entries[0],
> - sizeof(struct branch_entry) * tidq->last_branch_pos);
> - }
> -}
> -
> -static inline
> -void cs_etm__reset_last_branch_rb(struct cs_etm_traceid_queue *tidq)
> -{
> - tidq->last_branch_pos = 0;
> - tidq->last_branch_rb->nr = 0;
> -}
> -
> static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
> u8 trace_chan_id, u64 addr)
> {
> @@ -1400,38 +1345,6 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq,
> return addr;
> }
>
> -static void cs_etm__update_last_branch_rb(struct cs_etm_queue *etmq,
> - struct cs_etm_traceid_queue *tidq)
> -{
> - struct branch_stack *bs = tidq->last_branch_rb;
> - struct branch_entry *be;
> -
> - /*
> - * The branches are recorded in a circular buffer in reverse
> - * chronological order: we start recording from the last element of the
> - * buffer down. After writing the first element of the stack, move the
> - * insert position back to the end of the buffer.
> - */
> - if (!tidq->last_branch_pos)
> - tidq->last_branch_pos = etmq->etm->synth_opts.last_branch_sz;
> -
> - tidq->last_branch_pos -= 1;
> -
> - be = &bs->entries[tidq->last_branch_pos];
> - be->from = cs_etm__last_executed_instr(tidq->prev_packet);
> - be->to = cs_etm__first_executed_instr(tidq->packet);
> - /* No support for mispredict */
> - be->flags.mispred = 0;
> - be->flags.predicted = 1;
> -
> - /*
> - * Increment bs->nr until reaching the number of last branches asked by
> - * the user on the command line.
> - */
> - if (bs->nr < etmq->etm->synth_opts.last_branch_sz)
> - bs->nr += 1;
> -}
> -
> static int cs_etm__inject_event(struct cs_etm_auxtrace *etm, union perf_event *event,
> struct perf_sample *sample, u64 type)
> {
> @@ -1579,6 +1492,37 @@ static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
> return etm->latest_kernel_timestamp;
> }
>
> +static void cs_etm__add_stack_event(struct cs_etm_queue *etmq,
> + struct cs_etm_traceid_queue *tidq)
> +{
> + u64 from, to;
> + int size;
> +
> + if (!tidq->prev_packet->last_instr_taken_branch)
> + return;
> +
> + if (tidq->prev_packet->sample_type != CS_ETM_RANGE ||
> + tidq->packet->sample_type != CS_ETM_RANGE)
> + return;
> +
> + if (etmq->etm->synth_opts.last_branch) {
> + from = cs_etm__last_executed_instr(tidq->prev_packet);
> + to = cs_etm__first_executed_instr(tidq->packet);
> +
> + size = cs_etm__instr_size(etmq, tidq->trace_chan_id,
> + tidq->prev_packet->isa, from);
> +
> + /* Enable callchain so thread stack entry can be allocated */
> + thread_stack__event(tidq->thread, tidq->prev_packet->cpu,
> + tidq->prev_packet->flags, from, to, size,
> + etmq->buffer->buffer_nr + 1, true,
> + tidq->br_stack_sz, 0);
> + } else {
> + thread_stack__set_trace_nr(tidq->thread, tidq->prev_packet->cpu,
> + etmq->buffer->buffer_nr + 1);
> + }
> +}
> +
> static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> struct cs_etm_traceid_queue *tidq,
> u64 addr, u64 period)
> @@ -1608,8 +1552,12 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>
> cs_etm__copy_insn(etmq, tidq->trace_chan_id, tidq->packet, &sample);
>
> - if (etm->synth_opts.last_branch)
> + if (etm->synth_opts.last_branch) {
> + thread_stack__br_sample(tidq->thread, tidq->packet->cpu,
> + tidq->last_branch,
> + tidq->br_stack_sz);
> sample.branch_stack = tidq->last_branch;
> + }
>
> if (etm->synth_opts.inject) {
> ret = cs_etm__inject_event(etm, event, &sample,
> @@ -1798,14 +1746,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
>
> tidq->period_instructions += tidq->packet->instr_count;
>
> - /*
> - * Record a branch when the last instruction in
> - * PREV_PACKET is a branch.
> - */
> - if (etm->synth_opts.last_branch &&
> - tidq->prev_packet->sample_type == CS_ETM_RANGE &&
> - tidq->prev_packet->last_instr_taken_branch)
> - cs_etm__update_last_branch_rb(etmq, tidq);
> + cs_etm__add_stack_event(etmq, tidq);
Would it be cleaner to call this whenever a branch sample is generated?
Seems like the conditions for calling thread_stack__event() and
cs_etm__synth_branch_sample() are slightly different (ignoring the fact
that branches are only generated when the user asks for them).
Maybe the conditions should be different, but in that case a comment
explaining why would help; or, if they're meant to be the same, a shared
function for the conditions would help.
For example, we don't push a branch to the stack for
CS_ETM_DISCONTINUITY, but we do generate a branch sample from 0.
>
> if (etm->synth_opts.instructions &&
> tidq->period_instructions >= etm->instructions_sample_period) {
> @@ -1864,10 +1805,6 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
> u64 offset = etm->instructions_sample_period - instrs_prev;
> u64 addr;
>
> - /* Prepare last branches for instruction sample */
> - if (etm->synth_opts.last_branch)
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> while (tidq->period_instructions >=
> etm->instructions_sample_period) {
> /*
> @@ -1947,10 +1884,6 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
> etmq->etm->synth_opts.instructions &&
> tidq->prev_packet->sample_type == CS_ETM_RANGE) {
> u64 addr;
> -
> - /* Prepare last branches for instruction sample */
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> /*
> * Generate a last branch event for the branches left in the
> * circular buffer at the end of the trace.
> @@ -1982,7 +1915,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
>
> /* Reset last branches after flush the trace */
> if (etm->synth_opts.last_branch)
> - cs_etm__reset_last_branch_rb(tidq);
> + thread_stack__flush(tidq->thread);
>
> return err;
> }
> @@ -2006,9 +1939,6 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq,
> tidq->prev_packet->sample_type == CS_ETM_RANGE) {
> u64 addr;
>
> - /* Prepare last branches for instruction sample */
> - cs_etm__copy_last_branch_rb(etmq, tidq);
> -
> /*
> * Use the address of the end of the last reported execution
> * range.
>
next prev parent reply other threads:[~2026-06-04 14:09 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-26 16:59 [PATCH v6 0/8] perf cs-etm: Support thread stack and callchain Leo Yan
2026-05-26 16:59 ` [PATCH v6 1/8] perf cs-etm: Decode ETE exception packets Leo Yan
2026-06-04 14:10 ` James Clark
2026-05-26 16:59 ` [PATCH v6 2/8] perf cs-etm: Refactor instruction size handling Leo Yan
2026-06-04 14:11 ` James Clark
2026-05-26 16:59 ` [PATCH v6 3/8] perf cs-etm: Use thread-stack for last branch entries Leo Yan
2026-06-04 14:09 ` James Clark [this message]
2026-05-26 16:59 ` [PATCH v6 4/8] perf cs-etm: Flush thread stacks after decoder reset Leo Yan
2026-06-04 14:12 ` James Clark
2026-05-26 16:59 ` [PATCH v6 5/8] perf cs-etm: Support call indentation Leo Yan
2026-06-04 14:24 ` James Clark
2026-05-26 16:59 ` [PATCH v6 6/8] perf cs-etm: Filter synthesized branch samples Leo Yan
2026-06-04 14:42 ` James Clark
2026-05-26 16:59 ` [PATCH v6 7/8] perf cs-etm: Synthesize callchains for instruction samples Leo Yan
2026-06-04 15:07 ` James Clark
2026-05-26 16:59 ` [PATCH v6 8/8] perf test: Add Arm CoreSight callchain test Leo Yan
2026-05-29 14:57 ` [PATCH v6 0/8] perf cs-etm: Support thread stack and callchain Arnaldo Carvalho de Melo
2026-06-01 11:03 ` Leo Yan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=9d1e0448-27d7-42d3-aaa3-2d09489f18d9@linaro.org \
--to=james.clark@linaro$(echo .)org \
--cc=aaupov@fb$(echo .)com \
--cc=acme@kernel$(echo .)org \
--cc=adrian.hunter@intel$(echo .)com \
--cc=al.grant@arm$(echo .)com \
--cc=alexander.shishkin@linux$(echo .)intel.com \
--cc=coresight@lists$(echo .)linaro.org \
--cc=irogers@google$(echo .)com \
--cc=john.g.garry@oracle$(echo .)com \
--cc=jolsa@kernel$(echo .)org \
--cc=leo.yan@arm$(echo .)com \
--cc=linux-arm-kernel@lists$(echo .)infradead.org \
--cc=linux-perf-users@vger$(echo .)kernel.org \
--cc=mark.rutland@arm$(echo .)com \
--cc=mike.leach@arm$(echo .)com \
--cc=namhyung@kernel$(echo .)org \
--cc=paschalis.mpeis@arm$(echo .)com \
--cc=suzuki.poulose@arm$(echo .)com \
--cc=will@kernel$(echo .)org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox