[NETFILTER]: ip_conntrack: properly use RCU API for ip_ct_protos array
net/ipv4/netfilter/ip_conntrack_core.c (linux-2.6)
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers. */
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION    "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
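/* Updated with rcu_assign_pointer() (see ip_conntrack_init()); readers run
   under rcu_read_lock(), see __ip_conntrack_proto_find() and its callers. */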
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83 /* deliver cached events and clear cache entry - must be called with locally
84  * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91                                     ecache->ct);
92         ecache->events = 0;
93         ip_conntrack_put(ecache->ct);
94         ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98  * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101         struct ip_conntrack_ecache *ecache;
102
103         local_bh_disable();
104         ecache = &__get_cpu_var(ip_conntrack_ecache);
105         if (ecache->ct == ct)
106                 __ip_ct_deliver_cached_events(ecache);
107         local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112         struct ip_conntrack_ecache *ecache;
113
114         /* take care of delivering potentially old events */
115         ecache = &__get_cpu_var(ip_conntrack_ecache);
116         BUG_ON(ecache->ct == ct);
117         if (ecache->ct)
118                 __ip_ct_deliver_cached_events(ecache);
119         /* initialize for this conntrack/packet */
120         ecache->ct = ct;
121         nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125  * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128         struct ip_conntrack_ecache *ecache;
129         int cpu;
130
131         for_each_possible_cpu(cpu) {
132                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133                 if (ecache->ct)
134                         ip_conntrack_put(ecache->ct);
135         }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
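/* Hash a tuple with jhash over the source address, the destination address
   xor'ed with the protocol number, and both port words packed together; the
   random seed chosen at first use makes the bucket layout hard to predict. */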
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147                             unsigned int size, unsigned int rnd)
148 {
149         return (jhash_3words((__force u32)tuple->src.ip,
150                              ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
152                              rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158         return __hash_conntrack(tuple, ip_conntrack_htable_size,
159                                 ip_conntrack_hash_rnd);
160 }
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164                 const struct sk_buff *skb,
165                 unsigned int dataoff,
166                 struct ip_conntrack_tuple *tuple,
167                 const struct ip_conntrack_protocol *protocol)
168 {
169         /* Never happens */
170         if (iph->frag_off & htons(IP_OFFSET)) {
171                 printk("ip_conntrack_core: Frag of proto %u.\n",
172                        iph->protocol);
173                 return 0;
174         }
175
176         tuple->src.ip = iph->saddr;
177         tuple->dst.ip = iph->daddr;
178         tuple->dst.protonum = iph->protocol;
179         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181         return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186                    const struct ip_conntrack_tuple *orig,
187                    const struct ip_conntrack_protocol *protocol)
188 {
189         inverse->src.ip = orig->dst.ip;
190         inverse->dst.ip = orig->src.ip;
191         inverse->dst.protonum = orig->dst.protonum;
192         inverse->dst.dir = !orig->dst.dir;
193
194         return protocol->invert_tuple(inverse, orig);
195 }
196
197
198 /* ip_conntrack_expect helper functions */
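/* Unlink an expectation from the global list.  The caller must hold
   ip_conntrack_lock for writing and must already have stopped exp->timeout. */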
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201         IP_NF_ASSERT(!timer_pending(&exp->timeout));
202         list_del(&exp->list);
203         CONNTRACK_STAT_INC(expect_delete);
204         exp->master->expecting--;
205         ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210         struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212         write_lock_bh(&ip_conntrack_lock);
213         ip_ct_unlink_expect(exp);
214         write_unlock_bh(&ip_conntrack_lock);
215         ip_conntrack_expect_put(exp);
216 }
217
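/* Caller must hold ip_conntrack_lock (for reading or writing). */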
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221         struct ip_conntrack_expect *i;
222
223         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225                         return i;
226         }
227         return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234         struct ip_conntrack_expect *i;
235
236         read_lock_bh(&ip_conntrack_lock);
237         i = __ip_conntrack_expect_find(tuple);
238         if (i)
239                 atomic_inc(&i->use);
240         read_unlock_bh(&ip_conntrack_lock);
241
242         return i;
243 }
244
245 /* If an expectation for this connection is found, it is deleted from the
246  * global list and then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250         struct ip_conntrack_expect *i;
251
252         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253                 /* If master is not in hash table yet (i.e. packet hasn't left
254                    this machine yet), how can the other end know about it?
255                    Hence these are not the droids you are looking for (if
256                    master ct never got confirmed, we'd hold a reference to it
257                    and weird things would happen to future packets). */
258                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259                     && is_confirmed(i->master)) {
260                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
261                                 atomic_inc(&i->use);
262                                 return i;
263                         } else if (del_timer(&i->timeout)) {
264                                 ip_ct_unlink_expect(i);
265                                 return i;
266                         }
267                 }
268         }
269         return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275         struct ip_conntrack_expect *i, *tmp;
276
277         /* Optimization: most connections never expect any others. */
278         if (ct->expecting == 0)
279                 return;
280
281         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282                 if (i->master == ct && del_timer(&i->timeout)) {
283                         ip_ct_unlink_expect(i);
284                         ip_conntrack_expect_put(i);
285                 }
286         }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292         DEBUGP("clean_from_lists(%p)\n", ct);
293         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296         /* Destroy all pending expectations */
297         ip_ct_remove_expectations(ct);
298 }
299
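/* Final-release callback installed by ip_conntrack_alloc(); runs once the
   last nf_conntrack reference has been dropped and the use count is zero. */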
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304         struct ip_conntrack_protocol *proto;
305         struct ip_conntrack_helper *helper;
306
307         DEBUGP("destroy_conntrack(%p)\n", ct);
308         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
309         IP_NF_ASSERT(!timer_pending(&ct->timeout));
310
311         ip_conntrack_event(IPCT_DESTROY, ct);
312         set_bit(IPS_DYING_BIT, &ct->status);
313
314         helper = ct->helper;
315         if (helper && helper->destroy)
316                 helper->destroy(ct);
317
318         /* To make sure we don't get any weird locking issues here:
319          * destroy_conntrack() MUST NOT be called with a write lock
320          * to ip_conntrack_lock!!! -HW */
321         rcu_read_lock();
322         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
323         if (proto && proto->destroy)
324                 proto->destroy(ct);
325         rcu_read_unlock();
326
327         if (ip_conntrack_destroyed)
328                 ip_conntrack_destroyed(ct);
329
330         write_lock_bh(&ip_conntrack_lock);
331         /* Expectations will have been removed in clean_from_lists,
332          * except TFTP can create an expectation on the first packet,
333          * before the connection is in the list, so we need to clean here,
334          * too. */
335         ip_ct_remove_expectations(ct);
336
337         /* We overload first tuple to link into unconfirmed list. */
338         if (!is_confirmed(ct)) {
339                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
340                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
341         }
342
343         CONNTRACK_STAT_INC(delete);
344         write_unlock_bh(&ip_conntrack_lock);
345
346         if (ct->master)
347                 ip_conntrack_put(ct->master);
348
349         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
350         ip_conntrack_free(ct);
351 }
352
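/* Timeout handler: remove the conntrack from its lists and drop a reference;
   destroy_conntrack() runs once the last reference is gone. */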
353 static void death_by_timeout(unsigned long ul_conntrack)
354 {
355         struct ip_conntrack *ct = (void *)ul_conntrack;
356
357         write_lock_bh(&ip_conntrack_lock);
358         /* Inside lock so preempt is disabled on module removal path.
359          * Otherwise we can get spurious warnings. */
360         CONNTRACK_STAT_INC(delete_list);
361         clean_from_lists(ct);
362         write_unlock_bh(&ip_conntrack_lock);
363         ip_conntrack_put(ct);
364 }
365
366 struct ip_conntrack_tuple_hash *
367 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
368                     const struct ip_conntrack *ignored_conntrack)
369 {
370         struct ip_conntrack_tuple_hash *h;
371         unsigned int hash = hash_conntrack(tuple);
372
373         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
374                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
375                     ip_ct_tuple_equal(tuple, &h->tuple)) {
376                         CONNTRACK_STAT_INC(found);
377                         return h;
378                 }
379                 CONNTRACK_STAT_INC(searched);
380         }
381
382         return NULL;
383 }
384
385 /* Find a connection corresponding to a tuple. */
386 struct ip_conntrack_tuple_hash *
387 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
388                       const struct ip_conntrack *ignored_conntrack)
389 {
390         struct ip_conntrack_tuple_hash *h;
391
392         read_lock_bh(&ip_conntrack_lock);
393         h = __ip_conntrack_find(tuple, ignored_conntrack);
394         if (h)
395                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
396         read_unlock_bh(&ip_conntrack_lock);
397
398         return h;
399 }
400
401 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
402                                         unsigned int hash,
403                                         unsigned int repl_hash)
404 {
405         ct->id = ++ip_conntrack_next_id;
406         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
407                  &ip_conntrack_hash[hash]);
408         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
409                  &ip_conntrack_hash[repl_hash]);
410 }
411
412 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
413 {
414         unsigned int hash, repl_hash;
415
416         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
417         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
418
419         write_lock_bh(&ip_conntrack_lock);
420         __ip_conntrack_hash_insert(ct, hash, repl_hash);
421         write_unlock_bh(&ip_conntrack_lock);
422 }
423
424 /* Confirm a connection given skb; places it in hash table */
425 int
426 __ip_conntrack_confirm(struct sk_buff **pskb)
427 {
428         unsigned int hash, repl_hash;
429         struct ip_conntrack_tuple_hash *h;
430         struct ip_conntrack *ct;
431         enum ip_conntrack_info ctinfo;
432
433         ct = ip_conntrack_get(*pskb, &ctinfo);
434
435         /* ipt_REJECT uses ip_conntrack_attach to attach related
436            ICMP/TCP RST packets in other direction.  Actual packet
437            which created connection will be IP_CT_NEW or for an
438            expected connection, IP_CT_RELATED. */
439         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
440                 return NF_ACCEPT;
441
442         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
443         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
444
445         /* We're not in hash table, and we refuse to set up related
446            connections for unconfirmed conns.  But packet copies and
447            REJECT will give spurious warnings here. */
448         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
449
450         /* No external references means no one else could have
451            confirmed us. */
452         IP_NF_ASSERT(!is_confirmed(ct));
453         DEBUGP("Confirming conntrack %p\n", ct);
454
455         write_lock_bh(&ip_conntrack_lock);
456
457         /* See if there's one in the list already, including reverse:
458            NAT could have grabbed it without realizing, since we're
459            not in the hash.  If there is, we lost the race. */
460         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
461                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
462                                       &h->tuple))
463                         goto out;
464         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
465                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
466                                       &h->tuple))
467                         goto out;
468
469         /* Remove from unconfirmed list */
470         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
471
472         __ip_conntrack_hash_insert(ct, hash, repl_hash);
473         /* Timer relative to confirmation time, not original
474            setting time, otherwise we'd get timer wrap in
475            weird delay cases. */
476         ct->timeout.expires += jiffies;
477         add_timer(&ct->timeout);
478         atomic_inc(&ct->ct_general.use);
479         set_bit(IPS_CONFIRMED_BIT, &ct->status);
480         CONNTRACK_STAT_INC(insert);
481         write_unlock_bh(&ip_conntrack_lock);
482         if (ct->helper)
483                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
484 #ifdef CONFIG_IP_NF_NAT_NEEDED
485         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
486             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
487                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
488 #endif
489         ip_conntrack_event_cache(master_ct(ct) ?
490                                  IPCT_RELATED : IPCT_NEW, *pskb);
491
492         return NF_ACCEPT;
493
494 out:
495         CONNTRACK_STAT_INC(insert_failed);
496         write_unlock_bh(&ip_conntrack_lock);
497         return NF_DROP;
498 }
499
500 /* Returns true if a connection corresponds to the tuple (required
501    for NAT). */
502 int
503 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
504                          const struct ip_conntrack *ignored_conntrack)
505 {
506         struct ip_conntrack_tuple_hash *h;
507
508         read_lock_bh(&ip_conntrack_lock);
509         h = __ip_conntrack_find(tuple, ignored_conntrack);
510         read_unlock_bh(&ip_conntrack_lock);
511
512         return h != NULL;
513 }
514
515 /* There's a small race here where we may free a just-assured
516    connection.  Too bad: we're in trouble anyway. */
517 static int early_drop(struct list_head *chain)
518 {
519         /* Traverse backwards: gives us oldest, which is roughly LRU */
520         struct ip_conntrack_tuple_hash *h;
521         struct ip_conntrack *ct = NULL, *tmp;
522         int dropped = 0;
523
524         read_lock_bh(&ip_conntrack_lock);
525         list_for_each_entry_reverse(h, chain, list) {
526                 tmp = tuplehash_to_ctrack(h);
527                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
528                         ct = tmp;
529                         atomic_inc(&ct->ct_general.use);
530                         break;
531                 }
532         }
533         read_unlock_bh(&ip_conntrack_lock);
534
535         if (!ct)
536                 return dropped;
537
538         if (del_timer(&ct->timeout)) {
539                 death_by_timeout((unsigned long)ct);
540                 dropped = 1;
541                 CONNTRACK_STAT_INC(early_drop);
542         }
543         ip_conntrack_put(ct);
544         return dropped;
545 }
546
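/* Caller must hold ip_conntrack_lock.  Returns the first registered helper
   whose tuple/mask matches the given tuple. */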
547 static struct ip_conntrack_helper *
548 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
549 {
550         struct ip_conntrack_helper *h;
551
552         list_for_each_entry(h, &helpers, list) {
553                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
554                         return h;
555         }
556         return NULL;
557 }
558
559 struct ip_conntrack_helper *
560 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
561 {
562         struct ip_conntrack_helper *helper;
563
564         /* need ip_conntrack_lock to ensure that the helper exists until
565          * try_module_get() is called */
566         read_lock_bh(&ip_conntrack_lock);
567
568         helper = __ip_conntrack_helper_find(tuple);
569         if (helper) {
570                 /* need to increase module usage count to ensure the helper will
571                  * not go away while the caller is e.g. busy putting a
572                  * conntrack in the hash that uses the helper */
573                 if (!try_module_get(helper->me))
574                         helper = NULL;
575         }
576
577         read_unlock_bh(&ip_conntrack_lock);
578
579         return helper;
580 }
581
582 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
583 {
584         module_put(helper->me);
585 }
586
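/* Entries in ip_ct_protos[] are published with rcu_assign_pointer(), so
   callers must be inside rcu_read_lock() (or hold ip_conntrack_lock) for as
   long as they use the returned pointer. */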
587 struct ip_conntrack_protocol *
588 __ip_conntrack_proto_find(u_int8_t protocol)
589 {
590         return ip_ct_protos[protocol];
591 }
592
593 /* this is guaranteed to always return a valid protocol helper, since
594  * it falls back to generic_protocol */
595 struct ip_conntrack_protocol *
596 ip_conntrack_proto_find_get(u_int8_t protocol)
597 {
598         struct ip_conntrack_protocol *p;
599
600         rcu_read_lock();
601         p = __ip_conntrack_proto_find(protocol);
602         if (p) {
603                 if (!try_module_get(p->me))
604                         p = &ip_conntrack_generic_protocol;
605         }
606         rcu_read_unlock();
607
608         return p;
609 }
610
611 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
612 {
613         module_put(p->me);
614 }
615
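/* Allocate an unconfirmed conntrack for the given original/reply tuples.
   Returns ERR_PTR(-ENOMEM) when the table is full and early_drop() finds
   nothing to evict, or when the slab allocation fails.  The timer is set up
   but not started until the conntrack is confirmed. */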
616 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
617                                         struct ip_conntrack_tuple *repl)
618 {
619         struct ip_conntrack *conntrack;
620
621         if (!ip_conntrack_hash_rnd_initted) {
622                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
623                 ip_conntrack_hash_rnd_initted = 1;
624         }
625
626         /* We don't want any race condition at early drop stage */
627         atomic_inc(&ip_conntrack_count);
628
629         if (ip_conntrack_max
630             && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
631                 unsigned int hash = hash_conntrack(orig);
632                 /* Try dropping from this hash chain. */
633                 if (!early_drop(&ip_conntrack_hash[hash])) {
634                         atomic_dec(&ip_conntrack_count);
635                         if (net_ratelimit())
636                                 printk(KERN_WARNING
637                                        "ip_conntrack: table full, dropping"
638                                        " packet.\n");
639                         return ERR_PTR(-ENOMEM);
640                 }
641         }
642
643         conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
644         if (!conntrack) {
645                 DEBUGP("Can't allocate conntrack.\n");
646                 atomic_dec(&ip_conntrack_count);
647                 return ERR_PTR(-ENOMEM);
648         }
649
650         atomic_set(&conntrack->ct_general.use, 1);
651         conntrack->ct_general.destroy = destroy_conntrack;
652         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
653         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
654         /* Don't set timer yet: wait for confirmation */
655         init_timer(&conntrack->timeout);
656         conntrack->timeout.data = (unsigned long)conntrack;
657         conntrack->timeout.function = death_by_timeout;
658
659         return conntrack;
660 }
661
662 void
663 ip_conntrack_free(struct ip_conntrack *conntrack)
664 {
665         atomic_dec(&ip_conntrack_count);
666         kmem_cache_free(ip_conntrack_cachep, conntrack);
667 }
668
669 /* Allocate a new conntrack: we return -ENOMEM if classification
670  * failed due to stress.   Otherwise it really is unclassifiable */
671 static struct ip_conntrack_tuple_hash *
672 init_conntrack(struct ip_conntrack_tuple *tuple,
673                struct ip_conntrack_protocol *protocol,
674                struct sk_buff *skb)
675 {
676         struct ip_conntrack *conntrack;
677         struct ip_conntrack_tuple repl_tuple;
678         struct ip_conntrack_expect *exp;
679
680         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
681                 DEBUGP("Can't invert tuple.\n");
682                 return NULL;
683         }
684
685         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
686         if (conntrack == NULL || IS_ERR(conntrack))
687                 return (struct ip_conntrack_tuple_hash *)conntrack;
688
689         if (!protocol->new(conntrack, skb)) {
690                 ip_conntrack_free(conntrack);
691                 return NULL;
692         }
693
694         write_lock_bh(&ip_conntrack_lock);
695         exp = find_expectation(tuple);
696
697         if (exp) {
698                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
699                         conntrack, exp);
700                 /* Welcome, Mr. Bond.  We've been expecting you... */
701                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
702                 conntrack->master = exp->master;
703 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
704                 conntrack->mark = exp->master->mark;
705 #endif
706 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
707     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
708                 /* this is ugly, but there is no other place to put it */
709                 conntrack->nat.masq_index = exp->master->nat.masq_index;
710 #endif
711 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
712                 conntrack->secmark = exp->master->secmark;
713 #endif
714                 nf_conntrack_get(&conntrack->master->ct_general);
715                 CONNTRACK_STAT_INC(expect_new);
716         } else {
717                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
718
719                 CONNTRACK_STAT_INC(new);
720         }
721
722         /* Overload tuple linked list to put us in unconfirmed list. */
723         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
724
725         write_unlock_bh(&ip_conntrack_lock);
726
727         if (exp) {
728                 if (exp->expectfn)
729                         exp->expectfn(conntrack, exp);
730                 ip_conntrack_expect_put(exp);
731         }
732
733         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
734 }
735
736 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
737 static inline struct ip_conntrack *
738 resolve_normal_ct(struct sk_buff *skb,
739                   struct ip_conntrack_protocol *proto,
740                   int *set_reply,
741                   unsigned int hooknum,
742                   enum ip_conntrack_info *ctinfo)
743 {
744         struct ip_conntrack_tuple tuple;
745         struct ip_conntrack_tuple_hash *h;
746         struct ip_conntrack *ct;
747
748         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
749
750         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
751                                 &tuple,proto))
752                 return NULL;
753
754         /* look for tuple match */
755         h = ip_conntrack_find_get(&tuple, NULL);
756         if (!h) {
757                 h = init_conntrack(&tuple, proto, skb);
758                 if (!h)
759                         return NULL;
760                 if (IS_ERR(h))
761                         return (void *)h;
762         }
763         ct = tuplehash_to_ctrack(h);
764
765         /* It exists; we have a (non-exclusive) reference. */
766         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
767                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
768                 /* Please set reply bit if this packet OK */
769                 *set_reply = 1;
770         } else {
771                 /* Once we've had two way comms, always ESTABLISHED. */
772                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
773                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
774                                ct);
775                         *ctinfo = IP_CT_ESTABLISHED;
776                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
777                         DEBUGP("ip_conntrack_in: related packet for %p\n",
778                                ct);
779                         *ctinfo = IP_CT_RELATED;
780                 } else {
781                         DEBUGP("ip_conntrack_in: new packet for %p\n",
782                                ct);
783                         *ctinfo = IP_CT_NEW;
784                 }
785                 *set_reply = 0;
786         }
787         skb->nfct = &ct->ct_general;
788         skb->nfctinfo = *ctinfo;
789         return ct;
790 }
791
792 /* Netfilter hook itself. */
793 unsigned int ip_conntrack_in(unsigned int hooknum,
794                              struct sk_buff **pskb,
795                              const struct net_device *in,
796                              const struct net_device *out,
797                              int (*okfn)(struct sk_buff *))
798 {
799         struct ip_conntrack *ct;
800         enum ip_conntrack_info ctinfo;
801         struct ip_conntrack_protocol *proto;
802         int set_reply = 0;
803         int ret;
804
805         /* Previously seen (loopback or untracked)?  Ignore. */
806         if ((*pskb)->nfct) {
807                 CONNTRACK_STAT_INC(ignore);
808                 return NF_ACCEPT;
809         }
810
811         /* Never happens */
812         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
813                 if (net_ratelimit()) {
814                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
815                                (*pskb)->nh.iph->protocol, hooknum);
816                 }
817                 return NF_DROP;
818         }
819
820 /* Doesn't cover locally-generated broadcast, so not worth it. */
821 #if 0
822         /* Ignore broadcast: no `connection'. */
823         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
824                 printk("Broadcast packet!\n");
825                 return NF_ACCEPT;
826         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
827                    == htonl(0x000000FF)) {
828                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
829                        NIPQUAD((*pskb)->nh.iph->saddr),
830                        NIPQUAD((*pskb)->nh.iph->daddr),
831                        (*pskb)->sk, (*pskb)->pkt_type);
832         }
833 #endif
834
835         /* rcu_read_lock()ed by nf_hook_slow */
836         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
837
838         /* It may be a special packet, error, unclean...
839          * the inverse of the return code tells the netfilter
840          * core what to do with the packet. */
841         if (proto->error != NULL
842             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
843                 CONNTRACK_STAT_INC(error);
844                 CONNTRACK_STAT_INC(invalid);
845                 return -ret;
846         }
847
848         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
849                 /* Not valid part of a connection */
850                 CONNTRACK_STAT_INC(invalid);
851                 return NF_ACCEPT;
852         }
853
854         if (IS_ERR(ct)) {
855                 /* Too stressed to deal. */
856                 CONNTRACK_STAT_INC(drop);
857                 return NF_DROP;
858         }
859
860         IP_NF_ASSERT((*pskb)->nfct);
861
862         ret = proto->packet(ct, *pskb, ctinfo);
863         if (ret < 0) {
864                 /* Invalid: inverse of the return code tells
865                  * the netfilter core what to do*/
866                 nf_conntrack_put((*pskb)->nfct);
867                 (*pskb)->nfct = NULL;
868                 CONNTRACK_STAT_INC(invalid);
869                 return -ret;
870         }
871
872         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
873                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
874
875         return ret;
876 }
877
878 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
879                    const struct ip_conntrack_tuple *orig)
880 {
881         struct ip_conntrack_protocol *proto;
882         int ret;
883
884         rcu_read_lock();
885         proto = __ip_conntrack_proto_find(orig->dst.protonum);
886         ret = ip_ct_invert_tuple(inverse, orig, proto);
887         rcu_read_unlock();
888
889         return ret;
890 }
891
892 /* Would two expected things clash? */
893 static inline int expect_clash(const struct ip_conntrack_expect *a,
894                                const struct ip_conntrack_expect *b)
895 {
896         /* Part covered by intersection of masks must be unequal,
897            otherwise they clash */
898         struct ip_conntrack_tuple intersect_mask
899                 = { { a->mask.src.ip & b->mask.src.ip,
900                       { a->mask.src.u.all & b->mask.src.u.all } },
901                     { a->mask.dst.ip & b->mask.dst.ip,
902                       { a->mask.dst.u.all & b->mask.dst.u.all },
903                       a->mask.dst.protonum & b->mask.dst.protonum } };
904
905         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
906 }
907
908 static inline int expect_matches(const struct ip_conntrack_expect *a,
909                                  const struct ip_conntrack_expect *b)
910 {
911         return a->master == b->master
912                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
913                 && ip_ct_tuple_equal(&a->mask, &b->mask);
914 }
915
916 /* Generally a bad idea to call this: could have matched already. */
917 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
918 {
919         struct ip_conntrack_expect *i;
920
921         write_lock_bh(&ip_conntrack_lock);
922         /* choose the oldest expectation to evict */
923         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
924                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
925                         ip_ct_unlink_expect(i);
926                         write_unlock_bh(&ip_conntrack_lock);
927                         ip_conntrack_expect_put(i);
928                         return;
929                 }
930         }
931         write_unlock_bh(&ip_conntrack_lock);
932 }
933
934 /* We don't increase the master conntrack refcount for non-fulfilled
935  * expectations. During the conntrack destruction, the expectations are
936  * always killed before the conntrack itself */
937 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
938 {
939         struct ip_conntrack_expect *new;
940
941         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
942         if (!new) {
943                 DEBUGP("expect_related: OOM allocating expect\n");
944                 return NULL;
945         }
946         new->master = me;
947         atomic_set(&new->use, 1);
948         return new;
949 }
950
951 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
952 {
953         if (atomic_dec_and_test(&exp->use))
954                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
955 }
956
957 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
958 {
959         atomic_inc(&exp->use);
960         exp->master->expecting++;
961         list_add(&exp->list, &ip_conntrack_expect_list);
962
963         init_timer(&exp->timeout);
964         exp->timeout.data = (unsigned long)exp;
965         exp->timeout.function = expectation_timed_out;
966         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
967         add_timer(&exp->timeout);
968
969         exp->id = ++ip_conntrack_expect_next_id;
970         atomic_inc(&exp->use);
971         CONNTRACK_STAT_INC(expect_create);
972 }
973
974 /* Race with expectations being used means we could have none to find; OK. */
975 static void evict_oldest_expect(struct ip_conntrack *master)
976 {
977         struct ip_conntrack_expect *i;
978
979         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
980                 if (i->master == master) {
981                         if (del_timer(&i->timeout)) {
982                                 ip_ct_unlink_expect(i);
983                                 ip_conntrack_expect_put(i);
984                         }
985                         break;
986                 }
987         }
988 }
989
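/* Restart an existing expectation timer; returns 0 if the timer had already
   fired, i.e. the expectation is on its way out. */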
990 static inline int refresh_timer(struct ip_conntrack_expect *i)
991 {
992         if (!del_timer(&i->timeout))
993                 return 0;
994
995         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
996         add_timer(&i->timeout);
997         return 1;
998 }
999
1000 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1001 {
1002         struct ip_conntrack_expect *i;
1003         int ret;
1004
1005         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1006         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1007         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1008
1009         write_lock_bh(&ip_conntrack_lock);
1010         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1011                 if (expect_matches(i, expect)) {
1012                         /* Refresh timer: if it's dying, ignore.. */
1013                         if (refresh_timer(i)) {
1014                                 ret = 0;
1015                                 goto out;
1016                         }
1017                 } else if (expect_clash(i, expect)) {
1018                         ret = -EBUSY;
1019                         goto out;
1020                 }
1021         }
1022
1023         /* Will be over limit? */
1024         if (expect->master->helper->max_expected &&
1025             expect->master->expecting >= expect->master->helper->max_expected)
1026                 evict_oldest_expect(expect->master);
1027
1028         ip_conntrack_expect_insert(expect);
1029         ip_conntrack_expect_event(IPEXP_NEW, expect);
1030         ret = 0;
1031 out:
1032         write_unlock_bh(&ip_conntrack_lock);
1033         return ret;
1034 }
1035
1036 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1037    implicitly racy: see __ip_conntrack_confirm */
1038 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1039                               const struct ip_conntrack_tuple *newreply)
1040 {
1041         write_lock_bh(&ip_conntrack_lock);
1042         /* Should be unconfirmed, so not in hash table yet */
1043         IP_NF_ASSERT(!is_confirmed(conntrack));
1044
1045         DEBUGP("Altering reply tuple of %p to ", conntrack);
1046         DUMP_TUPLE(newreply);
1047
1048         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1049         if (!conntrack->master && conntrack->expecting == 0)
1050                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1051         write_unlock_bh(&ip_conntrack_lock);
1052 }
1053
1054 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1055 {
1056         BUG_ON(me->timeout == 0);
1057         write_lock_bh(&ip_conntrack_lock);
1058         list_add(&me->list, &helpers);
1059         write_unlock_bh(&ip_conntrack_lock);
1060
1061         return 0;
1062 }
1063
1064 struct ip_conntrack_helper *
1065 __ip_conntrack_helper_find_byname(const char *name)
1066 {
1067         struct ip_conntrack_helper *h;
1068
1069         list_for_each_entry(h, &helpers, list) {
1070                 if (!strcmp(h->name, name))
1071                         return h;
1072         }
1073
1074         return NULL;
1075 }
1076
1077 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1078                           const struct ip_conntrack_helper *me)
1079 {
1080         if (tuplehash_to_ctrack(i)->helper == me) {
1081                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1082                 tuplehash_to_ctrack(i)->helper = NULL;
1083         }
1084 }
1085
1086 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1087 {
1088         unsigned int i;
1089         struct ip_conntrack_tuple_hash *h;
1090         struct ip_conntrack_expect *exp, *tmp;
1091
1092         /* Need write lock here, to delete helper. */
1093         write_lock_bh(&ip_conntrack_lock);
1094         list_del(&me->list);
1095
1096         /* Get rid of expectations */
1097         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1098                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1099                         ip_ct_unlink_expect(exp);
1100                         ip_conntrack_expect_put(exp);
1101                 }
1102         }
1103         /* Clear the helper pointer on any conntracks still using this helper. */
1104         list_for_each_entry(h, &unconfirmed, list)
1105                 unhelp(h, me);
1106         for (i = 0; i < ip_conntrack_htable_size; i++) {
1107                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1108                         unhelp(h, me);
1109         }
1110         write_unlock_bh(&ip_conntrack_lock);
1111
1112         /* Someone could be still looking at the helper in a bh. */
1113         synchronize_net();
1114 }
1115
1116 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1117 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1118                         enum ip_conntrack_info ctinfo,
1119                         const struct sk_buff *skb,
1120                         unsigned long extra_jiffies,
1121                         int do_acct)
1122 {
1123         int event = 0;
1124
1125         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1126         IP_NF_ASSERT(skb);
1127
1128         write_lock_bh(&ip_conntrack_lock);
1129
1130         /* Only update if this is not a fixed timeout */
1131         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1132                 write_unlock_bh(&ip_conntrack_lock);
1133                 return;
1134         }
1135
1136         /* If not in hash table, timer will not be active yet */
1137         if (!is_confirmed(ct)) {
1138                 ct->timeout.expires = extra_jiffies;
1139                 event = IPCT_REFRESH;
1140         } else {
1141                 /* Need del_timer for race avoidance (may already be dying). */
1142                 if (del_timer(&ct->timeout)) {
1143                         ct->timeout.expires = jiffies + extra_jiffies;
1144                         add_timer(&ct->timeout);
1145                         event = IPCT_REFRESH;
1146                 }
1147         }
1148
1149 #ifdef CONFIG_IP_NF_CT_ACCT
1150         if (do_acct) {
1151                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1152                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1153                                                 ntohs(skb->nh.iph->tot_len);
1154                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1155                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1156                         event |= IPCT_COUNTER_FILLING;
1157         }
1158 #endif
1159
1160         write_unlock_bh(&ip_conntrack_lock);
1161
1162         /* must be unlocked when calling event cache */
1163         if (event)
1164                 ip_conntrack_event_cache(event, skb);
1165 }
1166
1167 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1168     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1169 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1170  * in ip_conntrack_core, since we don't want the protocols to autoload
1171  * or depend on ctnetlink */
1172 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1173                                const struct ip_conntrack_tuple *tuple)
1174 {
1175         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1176                 &tuple->src.u.tcp.port);
1177         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1178                 &tuple->dst.u.tcp.port);
1179         return 0;
1180
1181 nfattr_failure:
1182         return -1;
1183 }
1184
1185 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1186                                struct ip_conntrack_tuple *t)
1187 {
1188         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1189                 return -EINVAL;
1190
1191         t->src.u.tcp.port =
1192                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1193         t->dst.u.tcp.port =
1194                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1195
1196         return 0;
1197 }
1198 #endif
1199
1200 /* Returns new sk_buff, or NULL */
1201 struct sk_buff *
1202 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1203 {
1204         skb_orphan(skb);
1205
1206         local_bh_disable();
1207         skb = ip_defrag(skb, user);
1208         local_bh_enable();
1209
1210         if (skb)
1211                 ip_send_check(skb->nh.iph);
1212         return skb;
1213 }
1214
1215 /* Used by ipt_REJECT. */
1216 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1217 {
1218         struct ip_conntrack *ct;
1219         enum ip_conntrack_info ctinfo;
1220
1221         /* This ICMP is in reverse direction to the packet which caused it */
1222         ct = ip_conntrack_get(skb, &ctinfo);
1223
1224         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1225                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1226         else
1227                 ctinfo = IP_CT_RELATED;
1228
1229         /* Attach to new skbuff, and increment count */
1230         nskb->nfct = &ct->ct_general;
1231         nskb->nfctinfo = ctinfo;
1232         nf_conntrack_get(nskb->nfct);
1233 }
1234
1235 /* Bring out ya dead! */
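/* Walk the hash buckets and then the unconfirmed list under the write lock
   and return the next conntrack for which iter() is true, with an extra
   reference held for the caller. */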
1236 static struct ip_conntrack *
1237 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1238                 void *data, unsigned int *bucket)
1239 {
1240         struct ip_conntrack_tuple_hash *h;
1241         struct ip_conntrack *ct;
1242
1243         write_lock_bh(&ip_conntrack_lock);
1244         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1245                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1246                         ct = tuplehash_to_ctrack(h);
1247                         if (iter(ct, data))
1248                                 goto found;
1249                 }
1250         }
1251         list_for_each_entry(h, &unconfirmed, list) {
1252                 ct = tuplehash_to_ctrack(h);
1253                 if (iter(ct, data))
1254                         goto found;
1255         }
1256         write_unlock_bh(&ip_conntrack_lock);
1257         return NULL;
1258
1259 found:
1260         atomic_inc(&ct->ct_general.use);
1261         write_unlock_bh(&ip_conntrack_lock);
1262         return ct;
1263 }
1264
1265 void
1266 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1267 {
1268         struct ip_conntrack *ct;
1269         unsigned int bucket = 0;
1270
1271         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1272                 /* Time to push up daisies... */
1273                 if (del_timer(&ct->timeout))
1274                         death_by_timeout((unsigned long)ct);
1275                 /* ... else the timer will get him soon. */
1276
1277                 ip_conntrack_put(ct);
1278         }
1279 }
1280
1281 /* Fast function for those who don't want to parse /proc (and I don't
1282    blame them). */
1283 /* Reversing the socket's dst/src point of view gives us the reply
1284    mapping. */
1285 static int
1286 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1287 {
1288         struct inet_sock *inet = inet_sk(sk);
1289         struct ip_conntrack_tuple_hash *h;
1290         struct ip_conntrack_tuple tuple;
1291
1292         IP_CT_TUPLE_U_BLANK(&tuple);
1293         tuple.src.ip = inet->rcv_saddr;
1294         tuple.src.u.tcp.port = inet->sport;
1295         tuple.dst.ip = inet->daddr;
1296         tuple.dst.u.tcp.port = inet->dport;
1297         tuple.dst.protonum = IPPROTO_TCP;
1298
1299         /* We only do TCP at the moment: is there a better way? */
1300         if (strcmp(sk->sk_prot->name, "TCP")) {
1301                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1302                 return -ENOPROTOOPT;
1303         }
1304
1305         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1306                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1307                        *len, sizeof(struct sockaddr_in));
1308                 return -EINVAL;
1309         }
1310
1311         h = ip_conntrack_find_get(&tuple, NULL);
1312         if (h) {
1313                 struct sockaddr_in sin;
1314                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1315
1316                 sin.sin_family = AF_INET;
1317                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1318                         .tuple.dst.u.tcp.port;
1319                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1320                         .tuple.dst.ip;
1321                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1322
1323                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1324                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1325                 ip_conntrack_put(ct);
1326                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1327                         return -EFAULT;
1328                 else
1329                         return 0;
1330         }
1331         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1332                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1333                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1334         return -ENOENT;
1335 }
1336
1337 static struct nf_sockopt_ops so_getorigdst = {
1338         .pf             = PF_INET,
1339         .get_optmin     = SO_ORIGINAL_DST,
1340         .get_optmax     = SO_ORIGINAL_DST+1,
1341         .get            = &getorigdst,
1342 };
1343
1344 static int kill_all(struct ip_conntrack *i, void *data)
1345 {
1346         return 1;
1347 }
1348
1349 void ip_conntrack_flush(void)
1350 {
1351         ip_ct_iterate_cleanup(kill_all, NULL);
1352 }
1353
1354 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1355 {
1356         if (vmalloced)
1357                 vfree(hash);
1358         else
1359                 free_pages((unsigned long)hash,
1360                            get_order(sizeof(struct list_head) * size));
1361 }
1362
1363 /* Mishearing the voices in his head, our hero wonders how he's
1364    supposed to kill the mall. */
1365 void ip_conntrack_cleanup(void)
1366 {
1367         rcu_assign_pointer(ip_ct_attach, NULL);
1368
1369         /* This makes sure all current packets have passed through
1370            netfilter framework.  Roll on, two-stage module
1371            delete... */
1372         synchronize_net();
1373
1374         ip_ct_event_cache_flush();
1375  i_see_dead_people:
1376         ip_conntrack_flush();
1377         if (atomic_read(&ip_conntrack_count) != 0) {
1378                 schedule();
1379                 goto i_see_dead_people;
1380         }
1381         /* wait until all references to ip_conntrack_untracked are dropped */
1382         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1383                 schedule();
1384
1385         kmem_cache_destroy(ip_conntrack_cachep);
1386         kmem_cache_destroy(ip_conntrack_expect_cachep);
1387         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1388                             ip_conntrack_htable_size);
1389         nf_unregister_sockopt(&so_getorigdst);
1390 }
1391
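/* Allocate the bucket array with __get_free_pages(), falling back to
   vmalloc() on failure; *vmalloced records which allocator was used so
   free_conntrack_hash() can release the table the same way. */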
1392 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1393 {
1394         struct list_head *hash;
1395         unsigned int i;
1396
1397         *vmalloced = 0;
1398         hash = (void*)__get_free_pages(GFP_KERNEL,
1399                                        get_order(sizeof(struct list_head)
1400                                                  * size));
1401         if (!hash) {
1402                 *vmalloced = 1;
1403                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1404                 hash = vmalloc(sizeof(struct list_head) * size);
1405         }
1406
1407         if (hash)
1408                 for (i = 0; i < size; i++)
1409                         INIT_LIST_HEAD(&hash[i]);
1410
1411         return hash;
1412 }
1413
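/* Handler for the "hashsize" module parameter: allocate a new table, rehash
   every entry with a fresh random seed under the write lock, then free the
   old table. */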
1414 static int set_hashsize(const char *val, struct kernel_param *kp)
1415 {
1416         int i, bucket, hashsize, vmalloced;
1417         int old_vmalloced, old_size;
1418         int rnd;
1419         struct list_head *hash, *old_hash;
1420         struct ip_conntrack_tuple_hash *h;
1421
1422         /* On boot, we can set this without any fancy locking. */
1423         if (!ip_conntrack_htable_size)
1424                 return param_set_int(val, kp);
1425
1426         hashsize = simple_strtol(val, NULL, 0);
1427         if (!hashsize)
1428                 return -EINVAL;
1429
1430         hash = alloc_hashtable(hashsize, &vmalloced);
1431         if (!hash)
1432                 return -ENOMEM;
1433
1434         /* We have to rehash for the new table anyway, so we also can
1435          * use a new random seed */
1436         get_random_bytes(&rnd, 4);
1437
1438         write_lock_bh(&ip_conntrack_lock);
1439         for (i = 0; i < ip_conntrack_htable_size; i++) {
1440                 while (!list_empty(&ip_conntrack_hash[i])) {
1441                         h = list_entry(ip_conntrack_hash[i].next,
1442                                        struct ip_conntrack_tuple_hash, list);
1443                         list_del(&h->list);
1444                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1445                         list_add_tail(&h->list, &hash[bucket]);
1446                 }
1447         }
1448         old_size = ip_conntrack_htable_size;
1449         old_vmalloced = ip_conntrack_vmalloc;
1450         old_hash = ip_conntrack_hash;
1451
1452         ip_conntrack_htable_size = hashsize;
1453         ip_conntrack_vmalloc = vmalloced;
1454         ip_conntrack_hash = hash;
1455         ip_conntrack_hash_rnd = rnd;
1456         write_unlock_bh(&ip_conntrack_lock);
1457
1458         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1459         return 0;
1460 }
1461
1462 module_param_call(hashsize, set_hashsize, param_get_uint,
1463                   &ip_conntrack_htable_size, 0600);
1464
1465 int __init ip_conntrack_init(void)
1466 {
1467         unsigned int i;
1468         int ret;
1469
1470         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1471          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1472         if (!ip_conntrack_htable_size) {
1473                 ip_conntrack_htable_size
1474                         = (((num_physpages << PAGE_SHIFT) / 16384)
1475                            / sizeof(struct list_head));
1476                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1477                         ip_conntrack_htable_size = 8192;
1478                 if (ip_conntrack_htable_size < 16)
1479                         ip_conntrack_htable_size = 16;
1480         }
1481         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1482
1483         printk(KERN_INFO "ip_conntrack version %s (%u buckets, %d max)"
1484                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1485                ip_conntrack_htable_size, ip_conntrack_max,
1486                sizeof(struct ip_conntrack));
1487
1488         ret = nf_register_sockopt(&so_getorigdst);
1489         if (ret != 0) {
1490                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1491                 return ret;
1492         }
1493
1494         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1495                                             &ip_conntrack_vmalloc);
1496         if (!ip_conntrack_hash) {
1497                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1498                 goto err_unreg_sockopt;
1499         }
1500
1501         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1502                                                 sizeof(struct ip_conntrack), 0,
1503                                                 0, NULL, NULL);
1504         if (!ip_conntrack_cachep) {
1505                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1506                 goto err_free_hash;
1507         }
1508
1509         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1510                                         sizeof(struct ip_conntrack_expect),
1511                                         0, 0, NULL, NULL);
1512         if (!ip_conntrack_expect_cachep) {
1513                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1514                 goto err_free_conntrack_slab;
1515         }
1516
1517         /* Don't NEED lock here, but good form anyway. */
1518         write_lock_bh(&ip_conntrack_lock);
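        /* rcu_assign_pointer() makes sure each protocol structure is fully
         * initialised before lockless readers can see it in ip_ct_protos[]. */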
1519         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1520                 rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol);
1521         /* Sew in builtin protocols. */
1522         rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp);
1523         rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp);
1524         rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp);
1525         write_unlock_bh(&ip_conntrack_lock);
1526
1527         /* For use by ipt_REJECT */
1528         rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach);
1529
1530         /* Set up fake conntrack:
1531             - to never be deleted, not in any hashes */
1532         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1533         /*  - and make it look like a confirmed connection */
1534         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1535
1536         return ret;
1537
1538 err_free_conntrack_slab:
1539         kmem_cache_destroy(ip_conntrack_cachep);
1540 err_free_hash:
1541         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1542                             ip_conntrack_htable_size);
1543 err_unreg_sockopt:
1544         nf_unregister_sockopt(&so_getorigdst);
1545
1546         return -ENOMEM;
1547 }