From: Alistair Popple <alistair@popple•id.au>
To: Mark Hairgrove <mhairgrove@nvidia•com>
Cc: linuxppc-dev@lists•ozlabs.org, Reza Arbab <arbab@us•ibm.com>
Subject: Re: [PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates
Date: Tue, 02 Oct 2018 17:23:47 +1000 [thread overview]
Message-ID: <1590969.EuJDlkCUV7@new-mexico> (raw)
In-Reply-To: <1538090591-28519-2-git-send-email-mhairgrove@nvidia.com>
Thanks, this looks good to me.
Reviewed-by: Alistair Popple <alistair@popple•id.au>
On Thursday, 27 September 2018 4:23:09 PM AEST Mark Hairgrove wrote:
> There are two types of ATSDs issued to the NPU: invalidates targeting a
> specific virtual address and invalidates targeting the whole address
> space. In both cases prior to this change, the sequence was:
>
> for each NPU
> - Write the target address to the XTS_ATSD_AVA register
> - EIEIO
> - Write the launch value to issue the ATSD
>
> First, a target address is not required when invalidating the whole
> address space, so that write and the EIEIO have been removed. The AP
> (size) field in the launch is not needed either.
>
> Second, for per-address invalidates the above sequence is inefficient in
> the common case of multiple NPUs because an EIEIO is issued per NPU. This
> unnecessarily forces the launches of later ATSDs to be ordered with the
> launches of earlier ones. The new sequence only issues a single EIEIO:
>
> for each NPU
> - Write the target address to the XTS_ATSD_AVA register
> EIEIO
> for each NPU
> - Write the launch value to issue the ATSD
>
> Performance results were gathered using a microbenchmark which creates a
> 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in
> strides across the allocation.
>
> With only a single NPU active (one GPU) the difference is in the noise for
> both types of invalidates (+/-1%).
>
> With two NPUs active (on a 6-GPU system) the effect is more noticeable:
>
> mprotect rate (GB/s)
> Stride Before After Speedup
> 64K 5.9 6.5 10%
> 1M 31.2 33.4 7%
> 2M 36.3 38.7 7%
> 4M 322.6 356.7 11%
>
> Signed-off-by: Mark Hairgrove <mhairgrove@nvidia•com>
> ---
> arch/powerpc/platforms/powernv/npu-dma.c | 99 ++++++++++++++---------------
> 1 files changed, 48 insertions(+), 51 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index 8006c54..c8f438a 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
> }
>
> /* MMIO ATSD register offsets */
> -#define XTS_ATSD_AVA 1
> -#define XTS_ATSD_STAT 2
> +#define XTS_ATSD_LAUNCH 0
> +#define XTS_ATSD_AVA 1
> +#define XTS_ATSD_STAT 2
>
> -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
> - unsigned long launch, unsigned long va)
> +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
> + bool flush)
> {
> - struct npu *npu = mmio_atsd_reg->npu;
> - int reg = mmio_atsd_reg->reg;
> + unsigned long launch = 0;
>
> - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
> - eieio();
> - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
> + if (psize == MMU_PAGE_COUNT) {
> + /* IS set to invalidate entire matching PID */
> + launch |= PPC_BIT(12);
> + } else {
> + /* AP set to invalidate region of psize */
> + launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
> + }
> +
> + /* PRS set to process-scoped */
> + launch |= PPC_BIT(13);
> +
> + /* PID */
> + launch |= pid << PPC_BITLSHIFT(38);
> +
> + /* No flush */
> + launch |= !flush << PPC_BITLSHIFT(39);
> +
> + return launch;
> }
>
> -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> - unsigned long pid, bool flush)
> +static void mmio_atsd_regs_write(struct mmio_atsd_reg
> + mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
> + unsigned long val)
> {
> - int i;
> - unsigned long launch;
> + struct npu *npu;
> + int i, reg;
>
> for (i = 0; i <= max_npu2_index; i++) {
> - if (mmio_atsd_reg[i].reg < 0)
> + reg = mmio_atsd_reg[i].reg;
> + if (reg < 0)
> continue;
>
> - /* IS set to invalidate matching PID */
> - launch = PPC_BIT(12);
> -
> - /* PRS set to process-scoped */
> - launch |= PPC_BIT(13);
> -
> - /* AP */
> - launch |= (u64)
> - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> -
> - /* PID */
> - launch |= pid << PPC_BITLSHIFT(38);
> + npu = mmio_atsd_reg[i].npu;
> + __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
> + }
> +}
>
> - /* No flush */
> - launch |= !flush << PPC_BITLSHIFT(39);
> +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> + unsigned long pid, bool flush)
> +{
> + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
>
> - /* Invalidating the entire process doesn't use a va */
> - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
> - }
> + /* Invalidating the entire process doesn't use a va */
> + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
> }
>
> static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> unsigned long va, unsigned long pid, bool flush)
> {
> - int i;
> unsigned long launch;
>
> - for (i = 0; i <= max_npu2_index; i++) {
> - if (mmio_atsd_reg[i].reg < 0)
> - continue;
> -
> - /* IS set to invalidate target VA */
> - launch = 0;
> + launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
>
> - /* PRS set to process scoped */
> - launch |= PPC_BIT(13);
> + /* Write all VAs first */
> + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
>
> - /* AP */
> - launch |= (u64)
> - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
> -
> - /* PID */
> - launch |= pid << PPC_BITLSHIFT(38);
> -
> - /* No flush */
> - launch |= !flush << PPC_BITLSHIFT(39);
> + /* Issue one barrier for all address writes */
> + eieio();
>
> - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
> - }
> + /* Launch */
> + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
> }
>
> #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
>
next prev parent reply other threads:[~2018-10-02 7:27 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-09-27 23:23 [PATCH 0/3] powerpc/powernv/npu: Improve ATSD invalidation overhead Mark Hairgrove
2018-09-27 23:23 ` [PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates Mark Hairgrove
2018-10-02 7:23 ` Alistair Popple [this message]
2018-09-27 23:23 ` [PATCH 2/3] powerpc/powernv/npu: Use size-based " Mark Hairgrove
2018-10-02 7:11 ` Alistair Popple
2018-10-03 1:10 ` Mark Hairgrove
2018-10-03 2:27 ` Alistair Popple
2018-10-03 18:33 ` Mark Hairgrove
2018-09-27 23:23 ` [PATCH 3/3] powerpc/powernv/npu: Remove atsd_threshold debugfs setting Mark Hairgrove
2018-10-02 7:12 ` Alistair Popple
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1590969.EuJDlkCUV7@new-mexico \
--to=alistair@popple$(echo .)id.au \
--cc=arbab@us$(echo .)ibm.com \
--cc=linuxppc-dev@lists$(echo .)ozlabs.org \
--cc=mhairgrove@nvidia$(echo .)com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox