The seqcount xt_recseq is used to synchronize the replacement of xt_table::private in xt_replace_table() against all readers such as ipt_do_table(). After the pointer is replaced, xt_replace_table() iterates over all per-CPU xt_recseq to ensure that none of the CPUs is within the critical section. Once this is done, the old pointer can be examined and deallocated safely. This can also be achieved with RCU: Each reader of the private pointer will be within an RCU read section. The new pointer will be published with rcu_assign_pointer() and synchronize_rcu() is used to wait until each reader has left its critical section. Should this lead to a drop in performance due to synchronize_rcu() in the replacement, there are possible workarounds: - Use iptables-legacy-restore instead of multiple iptables-legacy statements - Use iptables-nft instead of iptables-legacy - Don't copy the counters after the replacement at all or simply don't wait for the in-flight counters and just copy what is there. Use RCU to assign xt_table::private and synchronise against readers. Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> --- include/linux/netfilter/x_tables.h | 8 ++++- net/ipv4/netfilter/arp_tables.c | 20 +++++++----- net/ipv4/netfilter/ip_tables.c | 20 +++++++----- net/ipv6/netfilter/ip6_tables.c | 20 +++++++----- net/netfilter/x_tables.c | 50 ++++++++++++------------------ 5 files changed, 62 insertions(+), 56 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index f39f688d72852..b9cd82e845d08 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -227,7 +227,7 @@ struct xt_table { unsigned int valid_hooks; /* Man behind the curtain... 
*/ - struct xt_table_info *private; + struct xt_table_info __rcu *priv_info; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; @@ -345,6 +345,12 @@ void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +bool xt_af_lock_held(u_int8_t af); +static inline struct xt_table_info *nf_table_private(const struct xt_table *table) +{ + return rcu_dereference_check(table->priv_info, xt_af_lock_held(table->af)); +} + /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 1cdd9c28ab2da..0628e68910f7f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,8 +203,9 @@ unsigned int arpt_do_table(void *priv, outdev = state->out ? state->out->name : nulldevname; local_bh_disable(); + rcu_read_lock(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ + private = rcu_dereference(table->priv_info); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -279,6 +280,7 @@ unsigned int arpt_do_table(void *priv, } } while (!acpar.hotdrop); xt_write_recseq_end(addend); + rcu_read_unlock(); local_bh_enable(); if (acpar.hotdrop) @@ -648,9 +650,9 @@ static void get_old_counters(const struct xt_table_info *t, static struct xt_counters *alloc_counters(const struct xt_table *table) { + const struct xt_table_info *private = nf_table_private(table); unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -671,10 +673,10 @@ static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { + struct xt_table_info *private = nf_table_private(table); unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; int ret = 0; void *loc_cpu_entry; @@ -808,7 +810,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; @@ -861,7 +863,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1022,7 +1024,8 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = 
t->private; + rcu_read_lock(); + private = rcu_dereference(t->priv_info); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1040,6 +1043,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } xt_write_recseq_end(addend); unlock_up_free: + rcu_read_unlock(); local_bh_enable(); xt_table_unlock(t); module_put(t->me); @@ -1340,8 +1344,8 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { + const struct xt_table_info *private = nf_table_private(table); struct xt_counters *counters; - const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; @@ -1390,7 +1394,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27fa..20e8b46af8876 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -256,8 +256,9 @@ ipt_do_table(void *priv, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); + rcu_read_lock(); + private = rcu_dereference(table->priv_info); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -354,6 +355,7 @@ ipt_do_table(void *priv, } while (!acpar.hotdrop); xt_write_recseq_end(addend); + rcu_read_unlock(); local_bh_enable(); if (acpar.hotdrop) @@ -788,9 +790,9 @@ static void get_old_counters(const struct xt_table_info *t, static struct xt_counters *alloc_counters(const struct xt_table *table) { + const struct xt_table_info *private = nf_table_private(table); unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -811,10 +813,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { + const struct xt_table_info *private = nf_table_private(table); unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; @@ -963,7 +965,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; @@ -1017,7 +1019,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1175,7 +1177,8 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + rcu_read_lock(); + private = 
rcu_dereference(t->priv_info); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1192,6 +1195,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } xt_write_recseq_end(addend); unlock_up_free: + rcu_read_unlock(); local_bh_enable(); xt_table_unlock(t); module_put(t->me); @@ -1550,8 +1554,8 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { + const struct xt_table_info *private = nf_table_private(table); struct xt_counters *counters; - const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; @@ -1597,7 +1601,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae72..c12d489a09840 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -278,8 +278,9 @@ ip6t_do_table(void *priv, struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); + rcu_read_lock(); + private = rcu_dereference(table->priv_info); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. 
*/ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -372,6 +373,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, } while (!acpar.hotdrop); xt_write_recseq_end(addend); + rcu_read_unlock(); local_bh_enable(); if (acpar.hotdrop) @@ -804,9 +806,9 @@ static void get_old_counters(const struct xt_table_info *t, static struct xt_counters *alloc_counters(const struct xt_table *table) { + const struct xt_table_info *private = nf_table_private(table); unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -827,10 +829,10 @@ copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { + const struct xt_table_info *private = nf_table_private(table); unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; @@ -979,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; @@ -1034,7 +1036,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - struct xt_table_info *private = t->private; + struct xt_table_info *private = nf_table_private(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1191,7 +1193,8 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + rcu_read_lock(); + private = 
rcu_dereference(t->priv_info); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1208,6 +1211,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } xt_write_recseq_end(addend); unlock_up_free: + rcu_read_unlock(); local_bh_enable(); xt_table_unlock(t); module_put(t->me); @@ -1559,8 +1563,8 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { + const struct xt_table_info *private = nf_table_private(table); struct xt_counters *counters; - const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; @@ -1606,7 +1610,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = nf_table_private(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 709840612f0df..ee38272cca562 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -85,6 +85,19 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; +#ifdef CONFIG_LOCKDEP +bool xt_af_lock_held(u_int8_t af) +{ + return lockdep_is_held(&xt[af].mutex) || +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT + lockdep_is_held(&xt[af].compat_mutex); +#else + false; +#endif +} +EXPORT_SYMBOL_GPL(xt_af_lock_held); +#endif + /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) { @@ -1388,7 +1401,6 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; - unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1397,48 +1409,24 @@ xt_replace_table(struct xt_table *table, return NULL; } - /* Do the substitution. 
*/ - local_bh_disable(); - private = table->private; + private = nf_table_private(table); /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; - /* - * Ensure contents of newinfo are visible before assigning to - * private. - */ - smp_wmb(); - table->private = newinfo; - - /* make sure all cpus see new ->private value */ - smp_mb(); + rcu_assign_pointer(table->priv_info, newinfo); /* * Even though table entries have now been swapped, other CPU's * may still be using the old entries... */ - local_bh_enable(); - - /* ... so wait for even xt_recseq on all cpus */ - for_each_possible_cpu(cpu) { - seqcount_t *s = &per_cpu(xt_recseq, cpu); - u32 seq = raw_read_seqcount(s); - - if (seq & 1) { - do { - cond_resched(); - cpu_relax(); - } while (seq == raw_read_seqcount(s)); - } - } + synchronize_rcu(); audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : @@ -1475,12 +1463,12 @@ struct xt_table *xt_register_table(struct net *net, } /* Simplifies replace_table code. */ - table->private = bootstrap; + rcu_assign_pointer(table->priv_info, bootstrap); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; - private = table->private; + private = nf_table_private(table); pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ @@ -1503,7 +1491,7 @@ void *xt_unregister_table(struct xt_table *table) struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); - private = table->private; + private = nf_table_private(table); list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, -- 2.47.2