public inbox for netdev@vger.kernel.org 
 help / color / mirror / Atom feed
From: "Toke Høiland-Jørgensen" <toke@redhat•com>
To: "Jonas Köppeler" <j.koeppeler@tu-berlin•de>,
	"Eric Dumazet" <edumazet@google•com>
Cc: "David S . Miller" <davem@davemloft•net>,
	Jakub Kicinski <kuba@kernel•org>, Paolo Abeni <pabeni@redhat•com>,
	Simon Horman <horms@kernel•org>,
	Jamal Hadi Salim <jhs@mojatatu•com>,
	Cong Wang <xiyou.wangcong@gmail•com>,
	Jiri Pirko <jiri@resnulli•us>,
	Kuniyuki Iwashima <kuniyu@google•com>,
	Willem de Bruijn <willemb@google•com>,
	netdev@vger•kernel.org, eric.dumazet@gmail•com
Subject: Re: [PATCH v1 net-next 5/5] net: dev_queue_xmit() llist adoption
Date: Tue, 11 Nov 2025 17:42:07 +0100	[thread overview]
Message-ID: <87ms4sldwg.fsf@toke.dk> (raw)
In-Reply-To: <99ba3114-10a3-4df2-b49e-63ce8687836d@tu-berlin.de>

Jonas Köppeler <j.koeppeler@tu-berlin•de> writes:

> On 11/10/25 18:34, Eric Dumazet wrote:
>> On Mon, Nov 10, 2025 at 6:49 AM Toke Høiland-Jørgensen <toke@redhat•com> wrote:
>>> Eric Dumazet <edumazet@google•com> writes:
>>>
>>>> I can work on a patch today.
>>> This sounds like an excellent idea in any case - thanks! :)
>> The following (on top of my last series) seems to work for me
>>
>>   include/net/pkt_sched.h   |    5 +++--
>>   include/net/sch_generic.h |   24 +++++++++++++++++++++++-
>>   net/core/dev.c            |   33 +++++++++++++++++++--------------
>>   net/sched/sch_cake.c      |    4 +++-
>>   4 files changed, 48 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
>> index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664
>> 100644
>> --- a/include/net/pkt_sched.h
>> +++ b/include/net/pkt_sched.h
>> @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>>
>>   void __qdisc_run(struct Qdisc *q);
>>
>> -static inline void qdisc_run(struct Qdisc *q)
>> +static inline struct sk_buff *qdisc_run(struct Qdisc *q)
>>   {
>>          if (qdisc_run_begin(q)) {
>>                  __qdisc_run(q);
>> -               qdisc_run_end(q);
>> +               return qdisc_run_end(q);
>>          }
>> +       return NULL;
>>   }
>>
>>   extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 79501499dafba56271b9ebd97a8f379ffdc83cac..19cd2bc13bdba48f941b1599f896c15c8c7860ae
>> 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -88,6 +88,8 @@ struct Qdisc {
>>   #define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
>>   #define TCQ_F_NOLOCK           0x100 /* qdisc does not require locking */
>>   #define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
>> +#define TCQ_F_DEQUEUE_DROPS    0x400 /* ->dequeue() can drop packets
>> in q->to_free */
>> +
>>          u32                     limit;
>>          const struct Qdisc_ops  *ops;
>>          struct qdisc_size_table __rcu *stab;
>> @@ -119,6 +121,8 @@ struct Qdisc {
>>
>>                  /* Note : we only change qstats.backlog in fast path. */
>>                  struct gnet_stats_queue qstats;
>> +
>> +               struct sk_buff          *to_free;
>>          __cacheline_group_end(Qdisc_write);
>>
>>
>> @@ -218,8 +222,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
>>          return true;
>>   }
>>
>> -static inline void qdisc_run_end(struct Qdisc *qdisc)
>> +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
>>   {
>> +       struct sk_buff *to_free = NULL;
>> +
>> +       if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
>> +               to_free = qdisc->to_free;
>> +               if (to_free)
>> +                       qdisc->to_free = NULL;
>> +       }
>>          if (qdisc->flags & TCQ_F_NOLOCK) {
>>                  spin_unlock(&qdisc->seqlock);
>>
>> @@ -235,6 +246,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
>>          } else {
>>                  WRITE_ONCE(qdisc->running, false);
>>          }
>> +       return to_free;
>>   }
>>
>>   static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
>> @@ -1105,6 +1117,16 @@ static inline void tcf_set_drop_reason(const
>> struct sk_buff *skb,
>>          tc_skb_cb(skb)->drop_reason = reason;
>>   }
>>
>> +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
>> +                                     enum skb_drop_reason reason)
>> +{
>> +       DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
>> +
>> +       tcf_set_drop_reason(skb, reason);
>> +       skb->next = q->to_free;
>> +       q->to_free = skb;
>> +}
>> +
>>   /* Instead of calling kfree_skb() while root qdisc lock is held,
>>    * queue the skb for future freeing at end of __dev_xmit_skb()
>>    */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index ac994974e2a81889fcc0a2e664edcdb7cfd0496d..18cfcd765b1b3e4af1c5339e36df517e7abc914f
>> 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                                   struct net_device *dev,
>>                                   struct netdev_queue *txq)
>>   {
>> -       struct sk_buff *next, *to_free = NULL;
>> +       struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
>>          spinlock_t *root_lock = qdisc_lock(q);
>>          struct llist_node *ll_list, *first_n;
>>          unsigned long defer_count = 0;
>> @@ -4160,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                          if (unlikely(!nolock_qdisc_is_empty(q))) {
>>                                  rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>>                                  __qdisc_run(q);
>> -                               qdisc_run_end(q);
>> +                               to_free2 = qdisc_run_end(q);
>>
>> -                               goto no_lock_out;
>> +                               goto free_out;
>>                          }
>>
>>                          qdisc_bstats_cpu_update(q, skb);
>> @@ -4170,18 +4170,15 @@ static inline int __dev_xmit_skb(struct
>> sk_buff *skb, struct Qdisc *q,
>>                              !nolock_qdisc_is_empty(q))
>>                                  __qdisc_run(q);
>>
>> -                       qdisc_run_end(q);
>> -                       return NET_XMIT_SUCCESS;
>> +                       to_free2 = qdisc_run_end(q);
>> +                       rc = NET_XMIT_SUCCESS;
>> +                       goto free_out;
>>                  }
>>
>>                  rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>> -               qdisc_run(q);
>> +               to_free2 = qdisc_run(q);
>>
>> -no_lock_out:
>> -               if (unlikely(to_free))
>> -                       kfree_skb_list_reason(to_free,
>> -                                             tcf_get_drop_reason(to_free));
>> -               return rc;
>> +               goto free_out;
>>          }
>>
>>          /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
>> @@ -4239,7 +4236,7 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                  qdisc_bstats_update(q, skb);
>>                  if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
>>                          __qdisc_run(q);
>> -               qdisc_run_end(q);
>> +               to_free2 = qdisc_run_end(q);
>>                  rc = NET_XMIT_SUCCESS;
>>          } else {
>>                  int count = 0;
>> @@ -4251,15 +4248,19 @@ static inline int __dev_xmit_skb(struct
>> sk_buff *skb, struct Qdisc *q,
>>                          rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>>                          count++;
>>                  }
>> -               qdisc_run(q);
>> +               to_free2 = qdisc_run(q);
>>                  if (count != 1)
>>                          rc = NET_XMIT_SUCCESS;
>>          }
>>   unlock:
>>          spin_unlock(root_lock);
>> +free_out:
>>          if (unlikely(to_free))
>>                  kfree_skb_list_reason(to_free,
>>                                        tcf_get_drop_reason(to_free));
>> +       if (unlikely(to_free2))
>> +               kfree_skb_list_reason(to_free2,
>> +                                     tcf_get_drop_reason(to_free2));
>>          return rc;
>>   }
>>
>> @@ -5741,6 +5742,7 @@ static __latent_entropy void net_tx_action(void)
>>          }
>>
>>          if (sd->output_queue) {
>> +               struct sk_buff *to_free;
>>                  struct Qdisc *head;
>>
>>                  local_irq_disable();
>> @@ -5780,9 +5782,12 @@ static __latent_entropy void net_tx_action(void)
>>                          }
>>
>>                          clear_bit(__QDISC_STATE_SCHED, &q->state);
>> -                       qdisc_run(q);
>> +                       to_free = qdisc_run(q);
>>                          if (root_lock)
>>                                  spin_unlock(root_lock);
>> +                       if (unlikely(to_free))
>> +                               kfree_skb_list_reason(to_free,
>> +                                             tcf_get_drop_reason(to_free));
>>                  }
>>
>>                  rcu_read_unlock();
>> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
>> index 312f5b000ffb67d74faf70f26d808e26315b4ab8..a717cc4e0606e80123ec9c76331d454dad699b69
>> 100644
>> --- a/net/sched/sch_cake.c
>> +++ b/net/sched/sch_cake.c
>> @@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
>>                  b->tin_dropped++;
>>                  qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
>>                  qdisc_qstats_drop(sch);
>> -               kfree_skb_reason(skb, reason);
>> +               qdisc_dequeue_drop(sch, skb, reason);
>>                  if (q->rate_flags & CAKE_FLAG_INGRESS)
>>                          goto retry;
>>          }
>> @@ -2569,6 +2569,8 @@ static void cake_reconfigure(struct Qdisc *sch)
>>
>>          sch->flags &= ~TCQ_F_CAN_BYPASS;
>>
>> +       sch->flags |= TCQ_F_DEQUEUE_DROPS;
>> +
>>          q->buffer_limit = min(q->buffer_limit,
>>                                max(sch->limit * psched_mtu(qdisc_dev(sch)),
>>                                    q->buffer_config_limit));
>
> Thanks for the patches. I experimented with these patches and these are the results:
> Running UDP flood (~21 Mpps load) over 2 minutes.
> pre-patch (baseline):
>    cake achieves stable packet rate of 0.476 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 3593552166 bytes 56149224 pkt (dropped 2183, overlimits 0
>          requeues 311)
>        backlog 0b 0p requeues 311
>        memory used: 15503616b of 15140Kb
>
> net-next/main:
>    cake throughput drops from 0.61 Mpps to 0.15 Mpps (the expected collapse
>    we've seen before)
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 1166199994 bytes 18221827 pkt (dropped 71317773, overlimits 0
>          requeues 1914)
>        backlog 0b 0p requeues 1914
>        memory used: 15504576b of 15140Kb
>
> net-next/main + this dequeue patch:
>    cake throughput drops in the first 6 seconds from 0.65 Mpps  and then
>    oscillates between 0.27–0.36 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 2627464378 bytes 41054083 pkt (dropped 50102108, overlimits 0
>          requeues 1008)
>        backlog 0b 0p requeues 1008
>        memory used: 15503616b of 15140Kb
>
> net-next/main + this dequeue patch + limiting the number of deferred packets:
>    cake throughput drops in the first 6 seconds from 0.65 Mpps  and then
>    oscillates between 0.35–0.43 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 2969529126 bytes 46398843 pkt (dropped 43618919, overlimits 0
>          requeues 814)
>        backlog 0b 0p requeues 814
>        memory used: 15503616b of 15140Kb
>
> I can provide the full throughput traces if needed.
> So the last patches definitely mitigate cake's performance degradation.

I also did a series of test runs with the variant in Eric's v2 patch set
that includes the dequeue side improvement, and it definitely helps, to
the point where I can now get up to 50% higher PPS with a cake instance
serving multiple flows.

However, it's unstable: if I add enough flows I end up back in a state
where throughput collapses to a dozen Kpps. The exact number of flows
where this happens varies a bit, but it's quite distinct when it does.

The number of flows it takes before things break down varies a bit; I
have not found a definite cause for this yet, but I'll keep looking.

-Toke

[0] https://lore.kernel.org/r/20251111093204.1432437-1-edumazet@google.com


  reply	other threads:[~2025-11-11 16:42 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-13 14:54 [PATCH v1 net-next 0/5] net: optimize TX throughput and efficiency Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 1/5] net: add add indirect call wrapper in skb_release_head_state() Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 2/5] net/sched: act_mirred: add loop detection Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 3/5] Revert "net/sched: Fix mirred deadlock on device recursion" Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 4/5] net: sched: claim one cache line in Qdisc Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 5/5] net: dev_queue_xmit() llist adoption Eric Dumazet
2025-11-07 15:28   ` Toke Høiland-Jørgensen
2025-11-07 15:37     ` Eric Dumazet
2025-11-07 15:46       ` Eric Dumazet
2025-11-09 10:09         ` Eric Dumazet
2025-11-09 12:54           ` Eric Dumazet
2025-11-09 16:33             ` Toke Høiland-Jørgensen
2025-11-09 17:14               ` Eric Dumazet
2025-11-09 19:18               ` Jonas Köppeler
2025-11-09 19:28                 ` Eric Dumazet
2025-11-09 20:18                   ` Eric Dumazet
2025-11-09 20:29                     ` Eric Dumazet
2025-11-10 11:31                       ` Toke Høiland-Jørgensen
2025-11-10 13:26                         ` Eric Dumazet
2025-11-10 14:49                           ` Toke Høiland-Jørgensen
2025-11-10 17:34                             ` Eric Dumazet
2025-11-11 13:44                               ` Jonas Köppeler
2025-11-11 16:42                                 ` Toke Høiland-Jørgensen [this message]
2025-10-13 16:23 ` [PATCH v1 net-next 0/5] net: optimize TX throughput and efficiency Toke Høiland-Jørgensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87ms4sldwg.fsf@toke.dk \
    --to=toke@redhat$(echo .)com \
    --cc=davem@davemloft$(echo .)net \
    --cc=edumazet@google$(echo .)com \
    --cc=eric.dumazet@gmail$(echo .)com \
    --cc=horms@kernel$(echo .)org \
    --cc=j.koeppeler@tu-berlin$(echo .)de \
    --cc=jhs@mojatatu$(echo .)com \
    --cc=jiri@resnulli$(echo .)us \
    --cc=kuba@kernel$(echo .)org \
    --cc=kuniyu@google$(echo .)com \
    --cc=netdev@vger$(echo .)kernel.org \
    --cc=pabeni@redhat$(echo .)com \
    --cc=willemb@google$(echo .)com \
    --cc=xiyou.wangcong@gmail$(echo .)com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox