public inbox for netdev@vger.kernel.org 
 help / color / mirror / Atom feed
From: Jason Gunthorpe <jgg@nvidia•com>
To: Leon Romanovsky <leon@kernel•org>
Cc: Patrisious Haddad <phaddad@nvidia•com>,
	"David S. Miller" <davem@davemloft•net>,
	Eric Dumazet <edumazet@google•com>,
	Jakub Kicinski <kuba@kernel•org>,
	linux-rdma@vger•kernel.org, netdev@vger•kernel.org,
	Paolo Abeni <pabeni@redhat•com>,
	Saeed Mahameed <saeedm@nvidia•com>
Subject: Re: [PATCH rdma-next v1 2/3] RDMA/mlx5: Handling dct common resource destruction upon firmware failure
Date: Tue, 21 Mar 2023 09:37:24 -0300	[thread overview]
Message-ID: <ZBmlBEldcG6rMcM1@nvidia.com> (raw)
In-Reply-To: <20230321120259.GT36557@unreal>

On Tue, Mar 21, 2023 at 02:02:59PM +0200, Leon Romanovsky wrote:
> On Tue, Mar 21, 2023 at 08:53:35AM -0300, Jason Gunthorpe wrote:
> > On Tue, Mar 21, 2023 at 09:54:58AM +0200, Leon Romanovsky wrote:
> > > On Mon, Mar 20, 2023 at 04:18:14PM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Mar 16, 2023 at 03:39:27PM +0200, Leon Romanovsky wrote:
> > > > > From: Patrisious Haddad <phaddad@nvidia•com>
> > > > > 
> > > > > Previously when destroying a DCT, if the firmware function for the
> > > > > destruction failed, the common resource would have been destroyed
> > > > > either way, since it was destroyed before the firmware object.
> > > > > Which leads to kernel warning "refcount_t: underflow" which indicates
> > > > > possible use-after-free.
> > > > > Which is triggered when we try to destroy the common resource for the
> > > > > second time and execute refcount_dec_and_test(&common->refcount).
> > > > > 
> > > > > So, currently before destroying the common resource we check its
> > > > > refcount and continue with the destruction only if it isn't zero.
> > > > 
> > > > This seems super sketchy
> > > > 
> > > > If the destruction fails why not set the refcount back to 1?
> > > 
> > > Because destruction will fail in destroy_rq_tracked() which is after
> > > destroy_resource_common().
> > > 
> > > In first destruction attempt, we delete qp from radix tree and wait for all
> > > reference to drop. In order do not undo all this logic (setting 1 alone is
> > > not enough), it is much safer simply skip destroy_resource_common() in reentry
> > > case.
> > 
> > This is the bug I pointed a long time ago, it is ordered wrong to
> > remove restrack before destruction is assured
> 
> It is not restrack, but internal to mlx5_core structure.
> 
>   176 static void destroy_resource_common(struct mlx5_ib_dev *dev,
>   177                                     struct mlx5_core_qp *qp)
>   178 {
>   179         struct mlx5_qp_table *table = &dev->qp_table;
>   180         unsigned long flags;
>   181
> 
> ....
> 
>   185         spin_lock_irqsave(&table->lock, flags);
>   186         radix_tree_delete(&table->tree,
>   187                           qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
>   188         spin_unlock_irqrestore(&table->lock, flags);
>   189         mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
>   190         wait_for_completion(&qp->common.free);
>   191 }

Same basic issue.

"RSC"'s refcount stuff is really only for ODP to use, and the silly
pseudo locking should really just be rwsem not a refcount.

Get DCT out of that particular mess and the scheme is quite simple and
doesn't nee hacky stuff.

Please make a patch to remove radix tree from this code too...

diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
index bae0334d6e7f18..68009bff4bd544 100644
--- a/drivers/infiniband/hw/mlx5/qpc.c
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -88,23 +88,34 @@ static bool is_event_type_allowed(int rsc_type, int event_type)
 	}
 }
 
+static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe)
+{
+	struct mlx5_core_dct *dct;
+	u32 qpn;
+
+	qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+	xa_lock(&dev->qp_table.dct_xa);
+	dct = xa_load(&dev->qp_table.dct_xa, qpn);
+	if (dct)
+		complete(&dct->drained);
+	xa_unlock(&dev->qp_table.dct_xa);
+	return NOTIFY_OK;
+}
+
 static int rsc_event_notifier(struct notifier_block *nb,
 			      unsigned long type, void *data)
 {
+	struct mlx5_ib_dev *dev =
+		container_of(nb, struct mlx5_ib_dev, qp_table.nb);
 	struct mlx5_core_rsc_common *common;
-	struct mlx5_qp_table *table;
-	struct mlx5_core_dct *dct;
+	struct mlx5_eqe *eqe = data;
 	u8 event_type = (u8)type;
 	struct mlx5_core_qp *qp;
-	struct mlx5_eqe *eqe;
 	u32 rsn;
 
 	switch (event_type) {
 	case MLX5_EVENT_TYPE_DCT_DRAINED:
-		eqe = data;
-		rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
-		rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
-		break;
+		return dct_event_notifier(dev, eqe);
 	case MLX5_EVENT_TYPE_PATH_MIG:
 	case MLX5_EVENT_TYPE_COMM_EST:
 	case MLX5_EVENT_TYPE_SQ_DRAINED:
@@ -113,7 +124,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
 	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
 	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
-		eqe = data;
 		rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
 		rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
 		break;
@@ -121,8 +131,7 @@ static int rsc_event_notifier(struct notifier_block *nb,
 		return NOTIFY_DONE;
 	}
 
-	table = container_of(nb, struct mlx5_qp_table, nb);
-	common = mlx5_get_rsc(table, rsn);
+	common = mlx5_get_rsc(&dev->qp_table, rsn);
 	if (!common)
 		return NOTIFY_OK;
 
@@ -137,11 +146,6 @@ static int rsc_event_notifier(struct notifier_block *nb,
 		qp->event(qp, event_type);
 		/* Need to put resource in event handler */
 		return NOTIFY_OK;
-	case MLX5_RES_DCT:
-		dct = (struct mlx5_core_dct *)common;
-		if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
-			complete(&dct->drained);
-		break;
 	default:
 		break;
 	}
@@ -188,7 +192,7 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev,
 }
 
 static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
-				  struct mlx5_core_dct *dct, bool need_cleanup)
+				  struct mlx5_core_dct *dct)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {};
 	struct mlx5_core_qp *qp = &dct->mqp;
@@ -203,13 +207,14 @@ static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
 	}
 	wait_for_completion(&dct->drained);
 destroy:
-	if (need_cleanup)
-		destroy_resource_common(dev, &dct->mqp);
 	MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
 	MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
 	MLX5_SET(destroy_dct_in, in, uid, qp->uid);
 	err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in);
-	return err;
+	if (err)
+		return err;
+	xa_cmpxchg(&dev->qp_table.dct_xa, dct->mqp.qpn, dct, NULL, GFP_KERNEL);
+	return 0;
 }
 
 int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
@@ -227,13 +232,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
 
 	qp->qpn = MLX5_GET(create_dct_out, out, dctn);
 	qp->uid = MLX5_GET(create_dct_in, in, uid);
-	err = create_resource_common(dev, qp, MLX5_RES_DCT);
+	err = xa_err(xa_store(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL));
 	if (err)
 		goto err_cmd;
 
 	return 0;
 err_cmd:
-	_mlx5_core_destroy_dct(dev, dct, false);
+	_mlx5_core_destroy_dct(dev, dct);
 	return err;
 }
 
@@ -284,7 +289,7 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
 int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
 			  struct mlx5_core_dct *dct)
 {
-	return _mlx5_core_destroy_dct(dev, dct, true);
+	return _mlx5_core_destroy_dct(dev, dct);
 }
 
 int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
@@ -488,6 +493,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
 
 	spin_lock_init(&table->lock);
 	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	xa_init(&table->dct_xa);
 	mlx5_qp_debugfs_init(dev->mdev);
 
 	table->nb.notifier_call = rsc_event_notifier;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209e4..87e19e6d07a94a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -381,7 +381,6 @@ enum mlx5_res_type {
 	MLX5_RES_SRQ	= 3,
 	MLX5_RES_XSRQ	= 4,
 	MLX5_RES_XRQ	= 5,
-	MLX5_RES_DCT	= MLX5_EVENT_QUEUE_TYPE_DCT,
 };
 
 struct mlx5_core_rsc_common {
@@ -443,6 +442,7 @@ struct mlx5_core_health {
 
 struct mlx5_qp_table {
 	struct notifier_block   nb;
+	struct xarray dct_xa;
 
 	/* protect radix tree
 	 */

  reply	other threads:[~2023-03-21 12:37 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-16 13:39 [PATCH rdma-next v1 0/3] Handle FW failures to destroy QP/RQ objects Leon Romanovsky
2023-03-16 13:39 ` [PATCH mlx5-next v1 1/3] net/mlx5: Nullify qp->dbg pointer post destruction Leon Romanovsky
2023-03-16 13:39 ` [PATCH rdma-next v1 2/3] RDMA/mlx5: Handling dct common resource destruction upon firmware failure Leon Romanovsky
2023-03-20 19:18   ` Jason Gunthorpe
2023-03-21  7:54     ` Leon Romanovsky
2023-03-21 11:53       ` Jason Gunthorpe
2023-03-21 12:02         ` Leon Romanovsky
2023-03-21 12:37           ` Jason Gunthorpe [this message]
2023-03-21 12:43             ` Leon Romanovsky
2023-03-16 13:39 ` [PATCH rdma-next v1 3/3] RDMA/mlx5: Return the firmware result upon destroying QP/RQ Leon Romanovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZBmlBEldcG6rMcM1@nvidia.com \
    --to=jgg@nvidia$(echo .)com \
    --cc=davem@davemloft$(echo .)net \
    --cc=edumazet@google$(echo .)com \
    --cc=kuba@kernel$(echo .)org \
    --cc=leon@kernel$(echo .)org \
    --cc=linux-rdma@vger$(echo .)kernel.org \
    --cc=netdev@vger$(echo .)kernel.org \
    --cc=pabeni@redhat$(echo .)com \
    --cc=phaddad@nvidia$(echo .)com \
    --cc=saeedm@nvidia$(echo .)com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox