* [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set
@ 2015-08-07 18:31 Jason Baron
2015-08-10 14:47 ` Eric Dumazet
0 siblings, 1 reply; 4+ messages in thread
From: Jason Baron @ 2015-08-07 18:31 UTC (permalink / raw)
To: davem, eric.dumazet; +Cc: netdev
From: Jason Baron <jbaron@akamai•com>
When SO_SNDBUF is set and we are under tcp memory pressure, the effective write
buffer space can be much lower than what was set using SO_SNDBUF. For example,
we may have set the buffer to 100kb, but we may only be able to write 10kb. In
this scenario poll()/select()/epoll() are going to continuously return POLLOUT,
followed by -EAGAIN from write() in a very tight loop.
Introduce sk->sk_effective_sndbuf, such that we can track the 'effective' size
of the sndbuf, when we have a short write due to memory pressure. By using the
sk->sk_effective_sndbuf instead of the sk->sk_sndbuf when we are under memory
pressure, we can delay the POLLOUT until 1/3 of the buffer clears as we normally
do. There is no issue here when SO_SNDBUF is not set, since the tcp layer will
auto-tune sk->sk_sndbuf.
In my testing, this brought a single thread's cpu usage down from 100% to 1%
while maintaining the same level of throughput when under memory pressure.
Signed-off-by: Jason Baron <jbaron@akamai•com>
---
include/net/sock.h | 12 ++++++++++++
net/core/sock.c | 1 +
net/core/stream.c | 1 +
net/ipv4/tcp.c | 10 +++++++---
4 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 43c6abc..ca49415 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -380,6 +380,7 @@ struct sock {
atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+ int sk_effective_sndbuf;
struct sk_buff_head sk_write_queue;
kmemcheck_bitfield_begin(flags);
unsigned int sk_shutdown : 2,
@@ -779,6 +780,14 @@ static inline bool sk_acceptq_is_full(const struct sock *sk)
return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}
+static inline void sk_set_effective_sndbuf(struct sock *sk)
+{
+ if (sk->sk_wmem_queued > sk->sk_sndbuf)
+ sk->sk_effective_sndbuf = sk->sk_sndbuf;
+ else
+ sk->sk_effective_sndbuf = sk->sk_wmem_queued;
+}
+
/*
* Compute minimal free write space needed to queue new packets.
*/
@@ -789,6 +798,9 @@ static inline int sk_stream_min_wspace(const struct sock *sk)
static inline int sk_stream_wspace(const struct sock *sk)
{
+ if (sk->sk_effective_sndbuf)
+ return sk->sk_effective_sndbuf - sk->sk_wmem_queued;
+
return sk->sk_sndbuf - sk->sk_wmem_queued;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 193901d..4fce879 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2309,6 +2309,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = sysctl_rmem_default;
sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_effective_sndbuf = 0;
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d70f77a..7c175e7 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -32,6 +32,7 @@ void sk_stream_write_space(struct sock *sk)
if (sk_stream_is_writeable(sk) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
+ sk->sk_effective_sndbuf = 0;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 45534a5..9e7f0a5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -845,6 +845,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
sk->sk_prot->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
+ sk_set_effective_sndbuf(sk);
return NULL;
}
@@ -939,9 +940,10 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy)) {
+ sk_set_effective_sndbuf(sk);
goto wait_for_memory;
-
+ }
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
@@ -1214,8 +1216,10 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (!sk_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy)) {
+ sk_set_effective_sndbuf(sk);
goto wait_for_memory;
+ }
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
--
1.8.2.rc2
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set
2015-08-07 18:31 [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set Jason Baron
@ 2015-08-10 14:47 ` Eric Dumazet
2015-08-10 17:29 ` Jason Baron
0 siblings, 1 reply; 4+ messages in thread
From: Eric Dumazet @ 2015-08-10 14:47 UTC (permalink / raw)
To: Jason Baron; +Cc: davem, netdev
On Fri, 2015-08-07 at 18:31 +0000, Jason Baron wrote:
> From: Jason Baron <jbaron@akamai•com>
>
> When SO_SNDBUF is set and we are under tcp memory pressure, the effective write
> buffer space can be much lower than what was set using SO_SNDBUF. For example,
> we may have set the buffer to 100kb, but we may only be able to write 10kb. In
> this scenario poll()/select()/epoll(), are going to continuously return POLLOUT,
> followed by -EAGAIN from write() in a very tight loop.
>
> Introduce sk->sk_effective_sndbuf, such that we can track the 'effective' size
> of the sndbuf, when we have a short write due to memory pressure. By using the
> sk->sk_effective_sndbuf instead of the sk->sk_sndbuf when we are under memory
> pressure, we can delay the POLLOUT until 1/3 of the buffer clears as we normally
> do. There is no issue here when SO_SNDBUF is not set, since the tcp layer will
> auto tune the sk->sndbuf.
>
> In my testing, this brought a single threaad's cpu usage down from 100% to 1%
> while maintaining the same level of throughput when under memory pressure.
>
I am not sure we need to grow socket for something that looks like a
flag ?
Also you add a race in sk_stream_wspace() as sk_effective_sndbuf value
can change under us.
+ if (sk->sk_effective_sndbuf)
+ return sk->sk_effective_sndbuf - sk->sk_wmem_queued;
+
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set
2015-08-10 14:47 ` Eric Dumazet
@ 2015-08-10 17:29 ` Jason Baron
2015-08-10 21:26 ` Eric Dumazet
0 siblings, 1 reply; 4+ messages in thread
From: Jason Baron @ 2015-08-10 17:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, netdev
On 08/10/2015 10:47 AM, Eric Dumazet wrote:
> On Fri, 2015-08-07 at 18:31 +0000, Jason Baron wrote:
>> From: Jason Baron <jbaron@akamai•com>
>>
>> When SO_SNDBUF is set and we are under tcp memory pressure, the effective write
>> buffer space can be much lower than what was set using SO_SNDBUF. For example,
>> we may have set the buffer to 100kb, but we may only be able to write 10kb. In
>> this scenario poll()/select()/epoll(), are going to continuously return POLLOUT,
>> followed by -EAGAIN from write() in a very tight loop.
>>
>> Introduce sk->sk_effective_sndbuf, such that we can track the 'effective' size
>> of the sndbuf, when we have a short write due to memory pressure. By using the
>> sk->sk_effective_sndbuf instead of the sk->sk_sndbuf when we are under memory
>> pressure, we can delay the POLLOUT until 1/3 of the buffer clears as we normally
>> do. There is no issue here when SO_SNDBUF is not set, since the tcp layer will
>> auto tune the sk->sndbuf.
>>
>> In my testing, this brought a single threaad's cpu usage down from 100% to 1%
>> while maintaining the same level of throughput when under memory pressure.
>>
>
> I am not sure we need to grow socket for something that looks like a
> flag ?
>
So I added a new field because I needed to store the new 'effective'
sndbuf somewhere and then restore the original value that was set via
SO_SNDBUF. So it's really because of SO_SNDBUF. We could perhaps use the fact
that we are in memory pressure to signal wakeups differently, but I'm
not sure exactly how.
> Also you add a race in sk_stream_wspace() as sk_effective_sndbuf value
> can change under us.
>
> + if (sk->sk_effective_sndbuf)
> + return sk->sk_effective_sndbuf - sk->sk_wmem_queued;
> +
>
>
>
>
thanks. better?
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -798,8 +798,10 @@ static inline int sk_stream_min_wspace(const struct
sock *sk)
static inline int sk_stream_wspace(const struct sock *sk)
{
- if (sk->sk_effective_sndbuf)
- return sk->sk_effective_sndbuf - sk->sk_wmem_queued;
+ int effective_sndbuf = sk->sk_effective_sndbuf;
+
+ if (effective_sndbuf)
+ return effective_sndbuf - sk->sk_wmem_queued;
return sk->sk_sndbuf - sk->sk_wmem_queued;
}
Thanks,
-Jason
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set
2015-08-10 17:29 ` Jason Baron
@ 2015-08-10 21:26 ` Eric Dumazet
0 siblings, 0 replies; 4+ messages in thread
From: Eric Dumazet @ 2015-08-10 21:26 UTC (permalink / raw)
To: Jason Baron; +Cc: davem, netdev
On Mon, 2015-08-10 at 13:29 -0400, Jason Baron wrote:
> > +
> >
> >
> >
> >
>
> thanks. better?
>
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -798,8 +798,10 @@ static inline int sk_stream_min_wspace(const struct
> sock *sk)
>
> static inline int sk_stream_wspace(const struct sock *sk)
> {
> - if (sk->sk_effective_sndbuf)
> - return sk->sk_effective_sndbuf - sk->sk_wmem_queued;
> + int effective_sndbuf = sk->sk_effective_sndbuf;
> +
> + if (effective_sndbuf)
> + return effective_sndbuf - sk->sk_wmem_queued;
>
> return sk->sk_sndbuf - sk->sk_wmem_queued;
> }
>
>
You need to use this instead:
int effective_sndbuf = READ_ONCE(sk->sk_effective_sndbuf);
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2015-08-10 21:26 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-08-07 18:31 [RFC PATCH net-next] tcp: reduce cpu usage under tcp memory pressure when SO_SNDBUF is set Jason Baron
2015-08-10 14:47 ` Eric Dumazet
2015-08-10 17:29 ` Jason Baron
2015-08-10 21:26 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox