From: Vlad Yasevich <vyasevich@gmail•com>
To: Hannes Frederic Sowa <hannes@stressinduktion•org>
Cc: David Miller <davem@davemloft•net>,
netdev@vger•kernel.org, eric.dumazet@gmail•com,
nicolas.dichtel@6wind•com
Subject: Re: [PATCH net-next] ipv6: implement rt_genid_bump_ipv6 with fn_sernum and remove rt6i_genid
Date: Thu, 11 Sep 2014 10:44:04 -0400 [thread overview]
Message-ID: <5411B534.3060303@gmail.com> (raw)
In-Reply-To: <1410445959.18873.37.camel@localhost>
On 09/11/2014 10:32 AM, Hannes Frederic Sowa wrote:
> On Do, 2014-09-11 at 10:19 -0400, Vlad Yasevich wrote:
>> On 09/11/2014 08:05 AM, Hannes Frederic Sowa wrote:
>>> On Mi, 2014-09-10 at 13:09 -0700, David Miller wrote:
>>>> From: Hannes Frederic Sowa <hannes@stressinduktion•org>
>>>> Date: Wed, 10 Sep 2014 11:31:28 +0200
>>>>
>>>>> In case we need to force the sockets to relookup the routes we now
>>>>> increase the fn_sernum on all fibnodes in the routing tree. This is a
>>>>> costly operation but should only happen if we have major routing/policy
>>>>> changes in the kernel (e.g. manual route adding/removal, xfrm policy
>>>>> changes).
>>>>
>>>> Core routers can update thousands of route updates per second, and they
>>>> do this via what you refer to as "manual route adding/removal".
>>>>
>>>> I don't think we want to put such a scalability problem into the tree.
>>>>
>>>> There has to be a lightweight way to address this.
>>>
>>> An alternative approach without traversing the routing table, but each
>>> newly inserted route (even only cached ones) might bump all other routes
>>> out of the per-socket caches:
>>>
>>> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
>>> index 9bcb220..a7e45b9 100644
>>> --- a/include/net/ip6_fib.h
>>> +++ b/include/net/ip6_fib.h
>>> @@ -119,8 +119,6 @@ struct rt6_info {
>>> struct inet6_dev *rt6i_idev;
>>> unsigned long _rt6i_peer;
>>>
>>> - u32 rt6i_genid;
>>> -
>>> /* more non-fragment space at head required */
>>> unsigned short rt6i_nfheader_len;
>>>
>>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
>>> index 361d260..428fdcb 100644
>>> --- a/include/net/net_namespace.h
>>> +++ b/include/net/net_namespace.h
>>> @@ -358,18 +358,28 @@ static inline int rt_genid_ipv6(struct net *net)
>>> return atomic_read(&net->ipv6.rt_genid);
>>> }
>>>
>>> -static inline void rt_genid_bump_ipv6(struct net *net)
>>> +static inline int rt_genid_bump_ipv6(struct net *net)
>>> {
>>> - atomic_inc(&net->ipv6.rt_genid);
>>> + int new, old;
>>> +
>>> + do {
>>> + old = atomic_read(&net->ipv6.rt_genid);
>>> + new = old + 1;
>>> + if (new <= 0)
>>> + new = 1;
>>> + } while (atomic_cmpxchg(&net->ipv6.rt_genid, old, new) != old);
>>> + return new;
>>> +
>>> }
>>> #else
>>> static inline int rt_genid_ipv6(struct net *net)
>>> {
>>> - return 0;
>>> + return 1;
>>> }
>>>
>>> -static inline void rt_genid_bump_ipv6(struct net *net)
>>> +static inline int rt_genid_bump_ipv6(struct net *net)
>>> {
>>> + return 1;
>>> }
>>> #endif
>>>
>>> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
>>> index 76b7f5e..4a2f130 100644
>>> --- a/net/ipv6/ip6_fib.c
>>> +++ b/net/ipv6/ip6_fib.c
>>> @@ -84,7 +84,10 @@ static int fib6_walk_continue(struct fib6_walker_t *w);
>>> * result of redirects, path MTU changes, etc.
>>> */
>>>
>>> -static __u32 rt_sernum;
>>> +static int fib6_new_sernum(struct net *net)
>>> +{
>>> + return rt_genid_bump_ipv6(net);
>>> +}
>>>
>>> static void fib6_gc_timer_cb(unsigned long arg);
>>>
>>> @@ -104,13 +107,6 @@ static inline void fib6_walker_unlink(struct fib6_walker_t *w)
>>> list_del(&w->lh);
>>> write_unlock_bh(&fib6_walker_lock);
>>> }
>>> -static __inline__ u32 fib6_new_sernum(void)
>>> -{
>>> - u32 n = ++rt_sernum;
>>> - if ((__s32)n <= 0)
>>> - rt_sernum = n = 1;
>>> - return n;
>>> -}
>>>
>>> /*
>>> * Auxiliary address test functions for the radix tree.
>>> @@ -421,16 +417,15 @@ out:
>>> */
>>>
>>> static struct fib6_node *fib6_add_1(struct fib6_node *root,
>>> - struct in6_addr *addr, int plen,
>>> - int offset, int allow_create,
>>> - int replace_required)
>>> + struct in6_addr *addr, int plen,
>>> + int offset, int allow_create,
>>> + int replace_required, int sernum)
>>> {
>>> struct fib6_node *fn, *in, *ln;
>>> struct fib6_node *pn = NULL;
>>> struct rt6key *key;
>>> int bit;
>>> __be32 dir = 0;
>>> - __u32 sernum = fib6_new_sernum();
>>>
>>> RT6_TRACE("fib6_add_1\n");
>>>
>>> @@ -844,6 +839,7 @@ void fib6_force_start_gc(struct net *net)
>>> int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>>> struct nlattr *mx, int mx_len)
>>> {
>>> + struct net *net = dev_net(rt->dst.dev);
>>> struct fib6_node *fn, *pn = NULL;
>>> int err = -ENOMEM;
>>> int allow_create = 1;
>>> @@ -860,7 +856,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>>>
>>> fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
>>> offsetof(struct rt6_info, rt6i_dst), allow_create,
>>> - replace_required);
>>> + replace_required, fib6_new_sernum(net));
>>> if (IS_ERR(fn)) {
>>> err = PTR_ERR(fn);
>>> fn = NULL;
>>> @@ -894,14 +890,15 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>>> sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
>>> atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
>>> sfn->fn_flags = RTN_ROOT;
>>> - sfn->fn_sernum = fib6_new_sernum();
>>> + sfn->fn_sernum = fib6_new_sernum(net);
>>>
>>> /* Now add the first leaf node to new subtree */
>>>
>>> sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
>>> rt->rt6i_src.plen,
>>> offsetof(struct rt6_info, rt6i_src),
>>> - allow_create, replace_required);
>>> + allow_create, replace_required,
>>> + fib6_new_sernum(net));
>>>
>>> if (IS_ERR(sn)) {
>>> /* If it is failed, discard just allocated
>>> @@ -920,7 +917,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>>> sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
>>> rt->rt6i_src.plen,
>>> offsetof(struct rt6_info, rt6i_src),
>>> - allow_create, replace_required);
>>> + allow_create, replace_required,
>>> + fib6_new_sernum(net));
>>>
>>> if (IS_ERR(sn)) {
>>> err = PTR_ERR(sn);
>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>> index f74b041..54b7d81 100644
>>> --- a/net/ipv6/route.c
>>> +++ b/net/ipv6/route.c
>>> @@ -314,7 +314,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
>>>
>>> memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
>>> rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
>>> - rt->rt6i_genid = rt_genid_ipv6(net);
>>> INIT_LIST_HEAD(&rt->rt6i_siblings);
>>> }
>>> return rt;
>>> @@ -1096,10 +1095,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
>>> * DST_OBSOLETE_FORCE_CHK which forces validation calls down
>>> * into this function always.
>>> */
>>> - if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev)))
>>> - return NULL;
>>> -
>>> - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
>>> + if (!rt->rt6i_node || rt_genid_ipv6(dev_net(rt->dst.dev)) != cookie)
>>> return NULL;
>>>
>>> if (rt6_check_expired(rt))
>>>
>>
>> Ok, so now we bump the gen_id every time we add a route and use that as fn_sernum for
>> that route. But this doesn't solve the problem that we are seeing in that a re-lookup
>> of the route still gives us an older route with an older gen_id.
>
> Hmm, this patch completely removes rt6i_genid from rt6_info (first
> hunk). We decide if we need to do the relookup based on the socket's
> cookie and the per-netns serial number of the ipv6 routing tables.
>
Hi Hannes
Right, but you still compare the per-netns serial number to the cookie. The cookie is
typically taken from the route. The route takes it from the per-netns serial number.
So, if you add a route with serial=1, then when the socket looks up that route, it
stashes 1 into the cookie.
Then you add another route and serial = 2. Now the dst_check on the socket fails
(cookie != rt_genid_ipv6(..)) and the socket has to re-lookup a route. It gets the same old
route as before with fn_sernum = 1. That's stashed in the cookie again. Next dst_check
is done, the route is invalidated again!
Am I missing something?
Thanks
-vlad
> Bye,
> Hannes
>
>
next prev parent reply other threads:[~2014-09-11 14:44 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-08-14 18:19 Performance regression on kernels 3.10 and newer Alexander Duyck
2014-08-14 18:46 ` Eric Dumazet
2014-08-14 19:50 ` Eric Dumazet
2014-08-14 19:59 ` Rick Jones
2014-08-14 20:31 ` Alexander Duyck
2014-08-14 20:51 ` Eric Dumazet
2014-08-14 20:46 ` Eric Dumazet
2014-08-14 23:16 ` Alexander Duyck
2014-08-14 23:20 ` David Miller
2014-08-14 23:25 ` Tom Herbert
2014-08-21 23:24 ` David Miller
2014-09-06 14:45 ` Eric Dumazet
2014-09-06 15:27 ` Eric Dumazet
2014-09-06 15:46 ` Eric Dumazet
2014-09-06 16:38 ` Eric Dumazet
2014-09-06 18:21 ` Eric Dumazet
2014-09-07 19:05 ` [PATCH net] ipv6: refresh rt6i_genid in ip6_pol_route() Eric Dumazet
2014-09-07 22:54 ` David Miller
2014-09-08 4:18 ` Eric Dumazet
2014-09-08 4:27 ` David Miller
2014-09-08 4:43 ` Eric Dumazet
2014-09-08 4:59 ` David Miller
2014-09-08 5:07 ` Eric Dumazet
2014-09-08 8:11 ` Nicolas Dichtel
2014-09-08 10:28 ` Eric Dumazet
2014-09-08 12:16 ` Nicolas Dichtel
2014-09-08 18:48 ` Vlad Yasevich
2014-09-09 12:58 ` Hannes Frederic Sowa
2014-09-10 9:31 ` [PATCH net-next] ipv6: implement rt_genid_bump_ipv6 with fn_sernum and remove rt6i_genid Hannes Frederic Sowa
2014-09-10 13:26 ` Vlad Yasevich
2014-09-10 13:42 ` Hannes Frederic Sowa
2014-09-10 20:09 ` David Miller
2014-09-11 8:30 ` Hannes Frederic Sowa
2014-09-11 12:22 ` Vlad Yasevich
2014-09-11 12:40 ` Hannes Frederic Sowa
2014-09-11 12:05 ` Hannes Frederic Sowa
2014-09-11 14:19 ` Vlad Yasevich
2014-09-11 14:32 ` Hannes Frederic Sowa
2014-09-11 14:44 ` Vlad Yasevich [this message]
2014-09-11 14:47 ` Hannes Frederic Sowa
2014-09-08 15:06 ` [PATCH v2 net-next] tcp: remove dst refcount false sharing for prequeue mode Eric Dumazet
2014-09-08 21:21 ` David Miller
2014-09-08 21:30 ` Eric Dumazet
2014-09-08 22:41 ` David Miller
2014-09-09 23:56 ` David Miller
2014-08-15 17:15 ` Performance regression on kernels 3.10 and newer Alexander Duyck
2014-08-15 17:59 ` Eric Dumazet
2014-08-15 18:49 ` Tom Herbert
2014-08-15 19:10 ` Alexander Duyck
2014-08-15 22:16 ` Tom Herbert
2014-08-15 23:23 ` Alexander Duyck
2014-08-18 9:03 ` David Laight
2014-08-18 15:22 ` Alexander Duyck
2014-08-18 15:29 ` Rick Jones
2014-08-21 23:51 ` David Miller
2014-08-14 23:48 ` Eric Dumazet
2014-08-15 0:33 ` Rick Jones
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5411B534.3060303@gmail.com \
--to=vyasevich@gmail$(echo .)com \
--cc=davem@davemloft$(echo .)net \
--cc=eric.dumazet@gmail$(echo .)com \
--cc=hannes@stressinduktion$(echo .)org \
--cc=netdev@vger$(echo .)kernel.org \
--cc=nicolas.dichtel@6wind$(echo .)com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox