public inbox for netdev@vger.kernel.org 
 help / color / mirror / Atom feed
From: Eric Dumazet <dada1@cosmosbay•com>
To: Stephen Hemminger <shemminger@vyatta•com>
Cc: David Miller <davem@davemloft•net>,
	Patrick McHardy <kaber@trash•net>,
	Rick Jones <rick.jones2@hp•com>,
	"Paul E. McKenney" <paulmck@linux•vnet.ibm.com>,
	netdev@vger•kernel.org, netfilter-devel@vger•kernel.org
Subject: Re: [RFT 3/3] iptables: lock free counters
Date: Wed, 04 Feb 2009 04:10:22 +0100	[thread overview]
Message-ID: <4989071E.5000803@cosmosbay.com> (raw)
In-Reply-To: <20090204001755.808036408@vyatta.com>

Stephen Hemminger a écrit :
> Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
> operatate without locking on the receive path.
> Replace existing reader/writer lock with Read-Copy-Update to
> elminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables especially on SMP
> systems.
> 
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
> 
> The other use of the reader-writer lock was to to block changes
> to counters while they were being read. This is handled by swapping in
> a new set of counter values and then summing the old ones. The sum
> is then restored back on a single cpu.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta•com>
> 
> ---
>  include/linux/netfilter/x_tables.h |   13 ++++
>  net/ipv4/netfilter/arp_tables.c    |   92 ++++++++++++++++++++++++-----------
>  net/ipv4/netfilter/ip_tables.c     |   97 ++++++++++++++++++++++++-------------
>  net/ipv6/netfilter/ip6_tables.c    |   97 +++++++++++++++++++++++--------------
>  net/netfilter/x_tables.c           |   67 +++++++++++++++++++++----
>  5 files changed, 258 insertions(+), 108 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-02-03 15:44:21.743663216 -0800
> @@ -353,7 +353,7 @@ struct xt_table
>  	unsigned int valid_hooks;
>  
>  	/* Lock for the curtain */
> -	rwlock_t lock;
> +	struct mutex lock;
>  
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>  
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> -	char *entries[1];
> +	void *entries[1];
>  };
>  
>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> @@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> +extern void xt_zero_table_entries(struct xt_table_info *info);
> +extern void xt_swap_table_entries(struct xt_table_info *old,
> +				  struct xt_table_info *new);
>  
>  #ifdef CONFIG_COMPAT
>  #include <net/compat.h>
> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-03 15:52:32.047583686 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> -	table_base = (void *)private->entries[smp_processor_id()];
> +
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
>  	/* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>  
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -924,13 +926,45 @@ get_counters(const struct xt_table_info 
>  				  counters,
>  				  &i);
>  	}
> +
> +}
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> +			 const struct xt_counters counters[])
> +{
> +	unsigned int i, cpu;
> +
> +	local_bh_disable();
> +	cpu = smp_processor_id();
> +	i = 0;
> +	IPT_ENTRY_ITERATE(t->entries[cpu],
> +			  t->size,
> +			  add_counter_to_entry,
> +			  counters,
> +			  &i);
> +	local_bh_enable();
>  }
>  
>  static struct xt_counters * alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
> -	const struct xt_table_info *private = table->private;
> +	struct xt_table_info *private = table->private;
> +	struct xt_table_info *tmp;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		return ERR_PTR(-ENOMEM);
> +		goto nomem;
> +
> +	tmp = xt_alloc_table_info(private->size);
> +	if (!tmp)
> +		goto free_counters;
> +

> +	xt_zero_table_entries(tmp);
This is not correct. We must copy rules and zero counters on the copied stuff.

> +
> +	mutex_lock(&table->lock);
> +	xt_swap_table_entries(private, tmp);
> +	synchronize_net();	/* Wait until smoke has cleared */
>  
> -	/* First, sum counters... */
> -	write_lock_bh(&table->lock);
> -	get_counters(private, counters);
> -	write_unlock_bh(&table->lock);
> +	get_counters(tmp, counters);

Yes, tmp now hold previous pointers

> +	put_counters(private, counters);
> +	mutex_unlock(&table->lock);
> +
> +	xt_free_table_info(tmp);
>  
>  	return counters;
> +
> + free_counters:
> +	vfree(counters);
> + nomem:
> +	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
>  
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -#if 0
> -	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> -		 *i,
> -		 (long unsigned int)e->counters.pcnt,
> -		 (long unsigned int)e->counters.bcnt,
> -		 (long unsigned int)addme[*i].pcnt,
> -		 (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
>  
> -	write_lock_bh(&t->lock);
> +	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> +	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
>  	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> +	preempt_enable();
>   unlock_up_free:
> -	write_unlock_bh(&t->lock);
> +	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
> +++ b/net/netfilter/x_tables.c	2009-02-03 15:44:21.743663216 -0800
> @@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
>  }
>  EXPORT_SYMBOL(xt_alloc_table_info);
>  
> -void xt_free_table_info(struct xt_table_info *info)
> +/* Zero out entries */
> +void xt_zero_table_entries(struct xt_table_info *info)
>  {
> -	int cpu;
> +	unsigned int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		memset(info->entries[cpu], 0, info->size);
> +}
> +EXPORT_SYMBOL_GPL(xt_zero_table_entries);

Hum, you forgot entries[cpu] points to quite complex data set,
with iptables rules, countaining counters...

Only counters must be zeroed, one by one.

You thus need an ITERATE invocation...

I wont be able to make the incremental patch (too busy @work at this moment),
I am sorry :(

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger•kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  parent reply	other threads:[~2009-02-04  3:10 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20090204001202.724266235@vyatta.com>
2009-02-04  1:40 ` [RFT 0/3] netfilter: lock free tables Rick Jones
     [not found] ` <20090204001755.808036408@vyatta.com>
2009-02-04  3:10   ` Eric Dumazet [this message]
2009-02-09 15:52     ` [RFT 3/3] iptables: lock free counters Patrick McHardy
2009-02-09 17:14       ` Stephen Hemminger
2009-02-10 17:52         ` [RFC] iptables: lock free counters (v0.6) Stephen Hemminger
2009-02-10 22:14           ` Ranjit Manomohan
2009-02-10 22:20           ` Rick Jones
2009-02-09 15:58   ` [RFT 3/3] iptables: lock free counters Patrick McHardy
     [not found] ` <20090204001755.549902016@vyatta.com>
2009-02-09 16:27   ` [RFT 1/3] netfilter: change elements in x_tables Patrick McHardy
     [not found] ` <20090204001755.685385465@vyatta.com>
2009-02-09 15:37   ` [RFT 2/3] netfilter: remove unneeded initializations Patrick McHardy
2009-02-09 16:23     ` Stephen Hemminger
2009-02-09 16:25       ` Patrick McHardy
2009-02-09 16:24     ` [PATCH] ebtables: " Stephen Hemminger
2009-02-09 16:30       ` Patrick McHardy
2009-02-09 16:28   ` [RFT 2/3] netfilter: " Patrick McHardy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4989071E.5000803@cosmosbay.com \
    --to=dada1@cosmosbay$(echo .)com \
    --cc=davem@davemloft$(echo .)net \
    --cc=kaber@trash$(echo .)net \
    --cc=netdev@vger$(echo .)kernel.org \
    --cc=netfilter-devel@vger$(echo .)kernel.org \
    --cc=paulmck@linux$(echo .)vnet.ibm.com \
    --cc=rick.jones2@hp$(echo .)com \
    --cc=shemminger@vyatta$(echo .)com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox