/* Expectation handling for nf_conntrack. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
11
12 #include <linux/types.h>
13 #include <linux/netfilter.h>
14 #include <linux/skbuff.h>
15 #include <linux/proc_fs.h>
16 #include <linux/seq_file.h>
17 #include <linux/stddef.h>
18 #include <linux/slab.h>
19 #include <linux/err.h>
20 #include <linux/percpu.h>
21 #include <linux/kernel.h>
22 #include <linux/jhash.h>
23 #include <net/net_namespace.h>
24
25 #include <net/netfilter/nf_conntrack.h>
26 #include <net/netfilter/nf_conntrack_core.h>
27 #include <net/netfilter/nf_conntrack_expect.h>
28 #include <net/netfilter/nf_conntrack_helper.h>
29 #include <net/netfilter/nf_conntrack_tuple.h>
30
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);

static unsigned int nf_ct_expect_hash_rnd __read_mostly;
unsigned int nf_ct_expect_max __read_mostly;
static int nf_ct_expect_hash_rnd_initted __read_mostly;

static struct kmem_cache *nf_ct_expect_cachep __read_mostly;

/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);

        NF_CT_ASSERT(master_help);
        NF_CT_ASSERT(!timer_pending(&exp->timeout));

        hlist_del_rcu(&exp->hnode);
        net->ct.expect_count--;

        hlist_del(&exp->lnode);
        master_help->expecting[exp->class]--;
        nf_ct_expect_put(exp);

        NF_CT_STAT_INC(net, expect_delete);
}
EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);

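/* Timer callback: the expectation was never matched before its timeout
 * expired.  Unlink it under nf_conntrack_lock, then drop the reference
 * that was held by the timer itself. */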
static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
        struct nf_conntrack_expect *exp = (void *)ul_expect;

        spin_lock_bh(&nf_conntrack_lock);
        nf_ct_unlink_expect(exp);
        spin_unlock_bh(&nf_conntrack_lock);
        nf_ct_expect_put(exp);
}

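/* Expectations are hashed by destination tuple only: jhash over the
 * destination address, keyed with the protocol numbers and the
 * destination port/id plus a lazily initialized random value, then
 * scaled into [0, nf_ct_expect_hsize) with a 32x32->64 bit multiply
 * instead of a modulus. */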
static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
{
        unsigned int hash;

        if (unlikely(!nf_ct_expect_hash_rnd_initted)) {
                get_random_bytes(&nf_ct_expect_hash_rnd, 4);
                nf_ct_expect_hash_rnd_initted = 1;
        }

        hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
                      (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
                       (__force __u16)tuple->dst.u.all) ^ nf_ct_expect_hash_rnd);
        return ((u64)hash * nf_ct_expect_hsize) >> 32;
}

struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;
        struct hlist_node *n;
        unsigned int h;

        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) {
                if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
                        return i;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(__nf_ct_expect_find);

/* Just find an expectation corresponding to a tuple. */
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;

        rcu_read_lock();
        i = __nf_ct_expect_find(net, tuple);
        if (i && !atomic_inc_not_zero(&i->use))
                i = NULL;
        rcu_read_unlock();

        return i;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
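
/*
 * A successful nf_ct_expect_find_get() grabs a reference, so callers must
 * pair it with nf_ct_expect_put().  Illustrative pattern (the surrounding
 * code is hypothetical):
 *
 *      exp = nf_ct_expect_find_get(net, tuple);
 *      if (exp != NULL) {
 *              ... inspect exp ...
 *              nf_ct_expect_put(exp);
 *      }
 */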

/* If an expectation for this connection is found, it is deleted from the
 * global list and then returned. */
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net, const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i, *exp = NULL;
        struct hlist_node *n;
        unsigned int h;

        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
                if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
                    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
                        exp = i;
                        break;
                }
        }
        if (!exp)
                return NULL;

        /* If the master is not in the hash table yet (i.e. the packet
           hasn't left this machine yet), how can the other end know about
           the expectation?  Hence these are not the droids you are
           looking for (if the master ct never got confirmed, we'd hold a
           reference to it and weird things would happen to future
           packets). */
        if (!nf_ct_is_confirmed(exp->master))
                return NULL;

        if (exp->flags & NF_CT_EXPECT_PERMANENT) {
                atomic_inc(&exp->use);
                return exp;
        } else if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                return exp;
        }

        return NULL;
}

/* Delete all expectations for this conntrack. */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_expect *exp;
        struct hlist_node *n, *next;

        /* Optimization: most connections never expect any others. */
        if (!help)
                return;

        hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect(exp);
                        nf_ct_expect_put(exp);
                }
        }
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
                               const struct nf_conntrack_expect *b)
{
        /* The parts covered by the intersection of both masks must be
           unequal, otherwise the expectations clash. */
        struct nf_conntrack_tuple_mask intersect_mask;
        int count;

        intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;

        for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
                intersect_mask.src.u3.all[count] =
                        a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
        }

        return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

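/* Two expectations are identical: same master conntrack, same expectation
 * class, and exactly equal tuple and mask. */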
static inline int expect_matches(const struct nf_conntrack_expect *a,
                                 const struct nf_conntrack_expect *b)
{
        return a->master == b->master && a->class == b->class
                && nf_ct_tuple_equal(&a->tuple, &b->tuple)
                && nf_ct_tuple_mask_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
        spin_lock_bh(&nf_conntrack_lock);
        if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                nf_ct_expect_put(exp);
        }
        spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);

/* We don't increase the master conntrack refcount for non-fulfilled
 * expectations. During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
        struct nf_conntrack_expect *new;

        new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
        if (!new)
                return NULL;

        new->master = me;
        atomic_set(&new->use, 1);
        INIT_RCU_HEAD(&new->rcu);
        return new;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);

void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
                       u_int8_t family,
                       const union nf_inet_addr *saddr,
                       const union nf_inet_addr *daddr,
                       u_int8_t proto, const __be16 *src, const __be16 *dst)
{
        int len;

        if (family == AF_INET)
                len = 4;
        else
                len = 16;

        exp->flags = 0;
        exp->class = class;
        exp->expectfn = NULL;
        exp->helper = NULL;
        exp->tuple.src.l3num = family;
        exp->tuple.dst.protonum = proto;

        if (saddr) {
                memcpy(&exp->tuple.src.u3, saddr, len);
                if (sizeof(exp->tuple.src.u3) > len)
                        /* address needs to be cleared for nf_ct_tuple_equal */
                        memset((void *)&exp->tuple.src.u3 + len, 0x00,
                               sizeof(exp->tuple.src.u3) - len);
                memset(&exp->mask.src.u3, 0xFF, len);
                if (sizeof(exp->mask.src.u3) > len)
                        memset((void *)&exp->mask.src.u3 + len, 0x00,
                               sizeof(exp->mask.src.u3) - len);
        } else {
                memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
                memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
        }

        if (src) {
                exp->tuple.src.u.all = *src;
                exp->mask.src.u.all = htons(0xFFFF);
        } else {
                exp->tuple.src.u.all = 0;
                exp->mask.src.u.all = 0;
        }

        memcpy(&exp->tuple.dst.u3, daddr, len);
        if (sizeof(exp->tuple.dst.u3) > len)
                /* address needs to be cleared for nf_ct_tuple_equal */
                memset((void *)&exp->tuple.dst.u3 + len, 0x00,
                       sizeof(exp->tuple.dst.u3) - len);

        exp->tuple.dst.u.all = *dst;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
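
/*
 * Illustrative use of the allocation/registration API by a connection
 * tracking helper (sketch only -- "ct", "port" and the choice of the
 * REPLY direction addresses are hypothetical; see
 * net/netfilter/nf_conntrack_ftp.c for a real caller):
 *
 *      struct nf_conntrack_expect *exp;
 *
 *      exp = nf_ct_expect_alloc(ct);
 *      if (exp == NULL)
 *              return NF_DROP;
 *      nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
 *                        &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
 *                        &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
 *                        IPPROTO_TCP, NULL, &port);
 *      if (nf_ct_expect_related(exp) != 0)
 *              ret = NF_DROP;
 *      nf_ct_expect_put(exp);
 */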

static void nf_ct_expect_free_rcu(struct rcu_head *head)
{
        struct nf_conntrack_expect *exp;

        exp = container_of(head, struct nf_conntrack_expect, rcu);
        kmem_cache_free(nf_ct_expect_cachep, exp);
}

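/* Drop a reference; the final put frees the expectation via call_rcu(), so
 * lockless readers traversing the expectation hash under rcu_read_lock()
 * never see it freed under them. */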
void nf_ct_expect_put(struct nf_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);

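/* Link the expectation into the master's helper list and the global hash
 * and start its policy timeout.  Two references are taken: one for the
 * lists (dropped by nf_ct_unlink_expect()) and one for the running timer
 * (dropped when the timer expires or is deleted). */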
static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);
        const struct nf_conntrack_expect_policy *p;
        unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);

        atomic_inc(&exp->use);

        hlist_add_head(&exp->lnode, &master_help->expectations);
        master_help->expecting[exp->class]++;

        hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
        net->ct.expect_count++;

        setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
                    (unsigned long)exp);
        p = &master_help->helper->expect_policy[exp->class];
        exp->timeout.expires = jiffies + p->timeout * HZ;
        add_timer(&exp->timeout);

        atomic_inc(&exp->use);
        NF_CT_STAT_INC(net, expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master,
                                struct nf_conntrack_expect *new)
{
        struct nf_conn_help *master_help = nfct_help(master);
        struct nf_conntrack_expect *exp, *last = NULL;
        struct hlist_node *n;

        hlist_for_each_entry(exp, n, &master_help->expectations, lnode) {
                if (exp->class == new->class)
                        last = exp;
        }

        if (last && del_timer(&last->timeout)) {
                nf_ct_unlink_expect(last);
                nf_ct_expect_put(last);
        }
}

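/* Restart the expectation's timeout from its policy value.  Returns 0 if
 * the timer could not be deleted, i.e. the expectation is already dying. */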
static inline int refresh_timer(struct nf_conntrack_expect *i)
{
        struct nf_conn_help *master_help = nfct_help(i->master);
        const struct nf_conntrack_expect_policy *p;

        if (!del_timer(&i->timeout))
                return 0;

        p = &master_help->helper->expect_policy[i->class];
        i->timeout.expires = jiffies + p->timeout * HZ;
        add_timer(&i->timeout);
        return 1;
}

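/* Validate a new expectation with nf_conntrack_lock held: the master must
 * still have a helper attached, an identical pending expectation merely
 * gets its timer refreshed, a clashing one fails with -EBUSY, and both
 * the per-class policy limit and the global nf_ct_expect_max are
 * enforced. */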
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
        const struct nf_conntrack_expect_policy *p;
        struct nf_conntrack_expect *i;
        struct nf_conn *master = expect->master;
        struct nf_conn_help *master_help = nfct_help(master);
        struct net *net = nf_ct_exp_net(expect);
        struct hlist_node *n;
        unsigned int h;
        int ret = 0;

        if (!master_help->helper) {
                ret = -ESHUTDOWN;
                goto out;
        }
        h = nf_ct_expect_dst_hash(&expect->tuple);
        hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }
        /* Will we be over the limit? */
        p = &master_help->helper->expect_policy[expect->class];
        if (p->max_expected &&
            master_help->expecting[expect->class] >= p->max_expected) {
                evict_oldest_expect(master, expect);
                if (master_help->expecting[expect->class] >= p->max_expected) {
                        ret = -EMFILE;
                        goto out;
                }
        }

        if (net->ct.expect_count >= nf_ct_expect_max) {
                if (net_ratelimit())
                        printk(KERN_WARNING
                               "nf_conntrack: expectation table full\n");
                ret = -EMFILE;
        }
out:
        return ret;
}

int nf_ct_expect_related(struct nf_conntrack_expect *expect)
{
        int ret;

        spin_lock_bh(&nf_conntrack_lock);
        ret = __nf_ct_expect_check(expect);
        if (ret < 0)
                goto out;

        nf_ct_expect_insert(expect);
        atomic_inc(&expect->use);
        spin_unlock_bh(&nf_conntrack_lock);
        nf_ct_expect_event(IPEXP_NEW, expect);
        nf_ct_expect_put(expect);
        return ret;
out:
        spin_unlock_bh(&nf_conntrack_lock);
        return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related);

int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
                                u32 pid, int report)
{
        int ret;

        spin_lock_bh(&nf_conntrack_lock);
        ret = __nf_ct_expect_check(expect);
        if (ret < 0)
                goto out;
        nf_ct_expect_insert(expect);
out:
        spin_unlock_bh(&nf_conntrack_lock);
        if (ret == 0)
                nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
        return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);

#ifdef CONFIG_PROC_FS
struct ct_expect_iter_state {
        struct seq_net_private p;
        unsigned int bucket;
};

static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
        struct net *net = seq_file_net(seq);
        struct ct_expect_iter_state *st = seq->private;
        struct hlist_node *n;

        for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
                n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
                if (n)
                        return n;
        }
        return NULL;
}

static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
                                             struct hlist_node *head)
{
        struct net *net = seq_file_net(seq);
        struct ct_expect_iter_state *st = seq->private;

        head = rcu_dereference(head->next);
        while (head == NULL) {
                if (++st->bucket >= nf_ct_expect_hsize)
                        return NULL;
                head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
        }
        return head;
}

static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
{
        struct hlist_node *head = ct_expect_get_first(seq);

        if (head)
                while (pos && (head = ct_expect_get_next(seq, head)))
                        pos--;
        return pos ? NULL : head;
}

static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        rcu_read_lock();
        return ct_expect_get_idx(seq, *pos);
}

static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        (*pos)++;
        return ct_expect_get_next(seq, v);
}

static void exp_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

static int exp_seq_show(struct seq_file *s, void *v)
{
        struct nf_conntrack_expect *expect;
        struct hlist_node *n = v;
        char *delim = "";

        expect = hlist_entry(n, struct nf_conntrack_expect, hnode);

        if (expect->timeout.function)
                seq_printf(s, "%ld ", timer_pending(&expect->timeout)
                           ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
        else
                seq_printf(s, "- ");
        seq_printf(s, "l3proto = %u proto=%u ",
                   expect->tuple.src.l3num,
                   expect->tuple.dst.protonum);
        print_tuple(s, &expect->tuple,
                    __nf_ct_l3proto_find(expect->tuple.src.l3num),
                    __nf_ct_l4proto_find(expect->tuple.src.l3num,
                                         expect->tuple.dst.protonum));

        if (expect->flags & NF_CT_EXPECT_PERMANENT) {
                seq_printf(s, "PERMANENT");
                delim = ",";
        }
        if (expect->flags & NF_CT_EXPECT_INACTIVE)
                seq_printf(s, "%sINACTIVE", delim);

        return seq_putc(s, '\n');
}
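
/*
 * Illustrative /proc/net/nf_conntrack_expect line for an IPv4/TCP
 * expectation (the tuple portion comes from the protocol-specific
 * print_tuple callbacks, so the exact fields vary; the values here are
 * made up):
 *
 *      296 l3proto = 2 proto=6 src=192.168.0.1 dst=192.168.0.2 sport=0 dport=20
 */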

static const struct seq_operations exp_seq_ops = {
        .start = exp_seq_start,
        .next = exp_seq_next,
        .stop = exp_seq_stop,
        .show = exp_seq_show
};

static int exp_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &exp_seq_ops,
                            sizeof(struct ct_expect_iter_state));
}

static const struct file_operations exp_file_ops = {
        .owner   = THIS_MODULE,
        .open    = exp_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
#endif /* CONFIG_PROC_FS */

static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry *proc;

        proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops);
        if (!proc)
                return -ENOMEM;
#endif /* CONFIG_PROC_FS */
        return 0;
}

static void exp_proc_remove(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_remove(net, "nf_conntrack_expect");
#endif /* CONFIG_PROC_FS */
}

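/* Expectation hash size, settable as a module parameter; if left at zero,
 * nf_conntrack_expect_init() below derives a default from
 * nf_conntrack_htable_size. */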
module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600);

int nf_conntrack_expect_init(struct net *net)
{
        int err = -ENOMEM;

        if (net_eq(net, &init_net)) {
                if (!nf_ct_expect_hsize) {
                        nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
                        if (!nf_ct_expect_hsize)
                                nf_ct_expect_hsize = 1;
                }
                nf_ct_expect_max = nf_ct_expect_hsize * 4;
        }

        net->ct.expect_count = 0;
        net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
                                                    &net->ct.expect_vmalloc);
        if (net->ct.expect_hash == NULL)
                goto err1;

        if (net_eq(net, &init_net)) {
                nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
                                        sizeof(struct nf_conntrack_expect),
                                        0, 0, NULL);
                if (!nf_ct_expect_cachep)
                        goto err2;
        }

        err = exp_proc_init(net);
        if (err < 0)
                goto err3;

        return 0;

err3:
        if (net_eq(net, &init_net))
                kmem_cache_destroy(nf_ct_expect_cachep);
err2:
        nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
                             nf_ct_expect_hsize);
err1:
        return err;
}

void nf_conntrack_expect_fini(struct net *net)
{
        exp_proc_remove(net);
        if (net_eq(net, &init_net))
                kmem_cache_destroy(nf_ct_expect_cachep);
        nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc,
                             nf_ct_expect_hsize);
}