/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/tcp.h>  /* For tcp_prot in getorigdst */
#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/jhash.h>

#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
#include <linux/netfilter_ipv4/ip_nat_core.h>
#include <linux/netfilter_ipv4/ip_nat_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/listhelp.h>

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_nat_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

static struct list_head *bysource;

#define MAX_IP_NAT_PROTO 256
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];

static inline struct ip_nat_protocol *
__ip_nat_proto_find(u_int8_t protonum)
{
        return ip_nat_protos[protonum];
}

struct ip_nat_protocol *
ip_nat_proto_find_get(u_int8_t protonum)
{
        struct ip_nat_protocol *p;

        /* we need to disable preemption to make sure 'p' doesn't get
         * removed until we've grabbed the reference */
        preempt_disable();
        p = __ip_nat_proto_find(protonum);
        if (p) {
                if (!try_module_get(p->me))
                        p = &ip_nat_unknown_protocol;
        }
        preempt_enable();

        return p;
}
EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);

void
ip_nat_proto_put(struct ip_nat_protocol *p)
{
        module_put(p->me);
}
EXPORT_SYMBOL_GPL(ip_nat_proto_put);

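/* A usage sketch (illustrative only, not part of the original file):
 * lookups must pair ip_nat_proto_find_get() with ip_nat_proto_put(),
 * so the module owning the protocol cannot be unloaded while the
 * pointer is still in use. */
#if 0
static void example_proto_get_put(void)
{
        struct ip_nat_protocol *p;

        p = ip_nat_proto_find_get(IPPROTO_TCP);
        /* ... safe to call p->manip_pkt(), p->in_range(), etc. ... */
        ip_nat_proto_put(p);
}
#endif
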
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct ip_conntrack_tuple *tuple)
{
        /* Original src, to ensure we map it consistently if poss. */
        return jhash_3words(tuple->src.ip, tuple->src.u.all,
                            tuple->dst.protonum, 0) % ip_nat_htable_size;
}

/* No one is using the conntrack by the time this is called. */
static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
{
        if (!(conn->status & IPS_NAT_DONE_MASK))
                return;

        write_lock_bh(&ip_nat_lock);
        list_del(&conn->nat.info.bysource);
        write_unlock_bh(&ip_nat_lock);
}

/* We do checksum mangling, so if they were wrong before they're still
 * wrong.  Also works for incomplete packets (eg. ICMP dest
 * unreachables.) */
u_int16_t
ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
{
        u_int32_t diffs[] = { oldvalinv, newval };
        return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
                                      oldcheck^0xFFFF));
}
EXPORT_SYMBOL(ip_nat_cheat_check);

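/* Illustrative sketch (hypothetical values): manip_pkt() below uses
 * ip_nat_cheat_check() to update the IP header checksum incrementally
 * when an address is rewritten, rather than recomputing it over the
 * whole header.  Folding in the one's complement of the old value plus
 * the new value yields the checksum the header would have had if it had
 * carried the new value all along (cf. RFC 1624). */
#if 0
static void example_rewrite_saddr(struct iphdr *iph, u_int32_t newsrc)
{
        iph->check = ip_nat_cheat_check(~iph->saddr, newsrc, iph->check);
        iph->saddr = newsrc;
}
#endif
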
/* Is this tuple already taken? (not by us) */
int
ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Connection tracking doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

           We could keep a separate hash if this proves too slow. */
        struct ip_conntrack_tuple reply;

        invert_tuplepr(&reply, tuple);
        return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
}
EXPORT_SYMBOL(ip_nat_used_tuple);

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range. */
static int
in_range(const struct ip_conntrack_tuple *tuple,
         const struct ip_nat_range *range)
{
        struct ip_nat_protocol *proto =
                __ip_nat_proto_find(tuple->dst.protonum);

        /* If we are supposed to map IPs, then we must be in the
           range specified, otherwise let this drag us onto a new src IP. */
        if (range->flags & IP_NAT_RANGE_MAP_IPS) {
                if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
                    || ntohl(tuple->src.ip) > ntohl(range->max_ip))
                        return 0;
        }

        if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
            || proto->in_range(tuple, IP_NAT_MANIP_SRC,
                               &range->min, &range->max))
                return 1;

        return 0;
}

static inline int
same_src(const struct ip_conntrack *ct,
         const struct ip_conntrack_tuple *tuple)
{
        return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
                == tuple->dst.protonum
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
                == tuple->src.ip
                && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
                == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(const struct ip_conntrack_tuple *tuple,
                     struct ip_conntrack_tuple *result,
                     const struct ip_nat_range *range)
{
        unsigned int h = hash_by_src(tuple);
        struct ip_conntrack *ct;

        read_lock_bh(&ip_nat_lock);
        list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
                if (same_src(ct, tuple)) {
                        /* Copy source part from reply tuple. */
                        invert_tuplepr(result,
                                       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
                        result->dst = tuple->dst;

                        if (in_range(result, range)) {
                                read_unlock_bh(&ip_nat_lock);
                                return 1;
                        }
                }
        }
        read_unlock_bh(&ip_nat_lock);
        return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
   1-65535, we don't do pro-rata allocation based on ports; we choose
   the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
find_best_ips_proto(struct ip_conntrack_tuple *tuple,
                    const struct ip_nat_range *range,
                    const struct ip_conntrack *conntrack,
                    enum ip_nat_manip_type maniptype)
{
        u_int32_t *var_ipp;
        /* Host order */
        u_int32_t minip, maxip, j;

        /* No IP mapping?  Do nothing. */
        if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
                return;

        if (maniptype == IP_NAT_MANIP_SRC)
                var_ipp = &tuple->src.ip;
        else
                var_ipp = &tuple->dst.ip;

        /* Fast path: only one choice. */
        if (range->min_ip == range->max_ip) {
                *var_ipp = range->min_ip;
                return;
        }

        /* Hashing source and destination IPs gives a fairly even
         * spread in practice (if there are a small number of IPs
         * involved, there usually aren't that many connections
         * anyway).  The consistency means that servers see the same
         * client coming from the same IP (some Internet Banking sites
         * like this), even across reboots. */
        minip = ntohl(range->min_ip);
        maxip = ntohl(range->max_ip);
        j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
        *var_ipp = htonl(minip + j % (maxip - minip + 1));
}

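/* Worked example (hypothetical range): with min_ip 10.0.0.1 and max_ip
 * 10.0.0.4, maxip - minip + 1 == 4, and jhash_2words() over a given
 * src/dst pair always yields the same j, so that pair is pinned to
 * 10.0.0.1 + (j % 4).  A different pair hashes independently, spreading
 * load across the range without any per-IP usage counters. */
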
/* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
 * we change the source to map into the range.  For NF_IP_PRE_ROUTING
 * and NF_IP_LOCAL_OUT, we change the destination to map into the
 * range.  It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct ip_conntrack_tuple *tuple,
                 const struct ip_conntrack_tuple *orig_tuple,
                 const struct ip_nat_range *range,
                 struct ip_conntrack *conntrack,
                 enum ip_nat_manip_type maniptype)
{
        struct ip_nat_protocol *proto;

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
           and that same mapping gives a unique tuple within the given
           range, use that.

           This is only required for source (ie. NAT/masq) mappings.
           So far, we don't do local source mappings, so multiple
           manips not an issue. */
        if (maniptype == IP_NAT_MANIP_SRC) {
                if (find_appropriate_src(orig_tuple, tuple, range)) {
                        DEBUGP("get_unique_tuple: Found current src map\n");
                        if (!ip_nat_used_tuple(tuple, conntrack))
                                return;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given
           range. */
        *tuple = *orig_tuple;
        find_best_ips_proto(tuple, range, conntrack, maniptype);

        /* 3) The per-protocol part of the manip is made to map into
           the range to make a unique tuple. */
        proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);

        /* Only bother mapping if it's not already in range and unique */
        if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
             || proto->in_range(tuple, maniptype, &range->min, &range->max))
            && !ip_nat_used_tuple(tuple, conntrack)) {
                ip_nat_proto_put(proto);
                return;
        }

        /* Last chance: get protocol to try to obtain unique tuple. */
        proto->unique_tuple(tuple, range, maniptype, conntrack);

        ip_nat_proto_put(proto);
}

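/* Worked example (hypothetical addresses): SNAT of 10.0.0.2:1024 ->
 * 198.51.100.7:80 onto the single-IP range 203.0.113.1.  Step 1 finds
 * no existing mapping for that source; step 2 rewrites the source IP,
 * giving 203.0.113.1:1024 -> 198.51.100.7:80; step 3 keeps port 1024 if
 * that tuple is unused, otherwise the TCP protocol helper's
 * unique_tuple() picks a different source port. */
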
unsigned int
ip_nat_setup_info(struct ip_conntrack *conntrack,
                  const struct ip_nat_range *range,
                  unsigned int hooknum)
{
        struct ip_conntrack_tuple curr_tuple, new_tuple;
        struct ip_nat_info *info = &conntrack->nat.info;
        int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
        enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);

        IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
                     || hooknum == NF_IP_POST_ROUTING
                     || hooknum == NF_IP_LOCAL_IN
                     || hooknum == NF_IP_LOCAL_OUT);
        BUG_ON(ip_nat_initialized(conntrack, maniptype));

        /* What we've got will look like inverse of reply.  Normally
           this is what is in the conntrack, except for prior
           manipulations (future optimization: if num_manips == 0,
           orig_tp =
           conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
        invert_tuplepr(&curr_tuple,
                       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

        get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);

        if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
                struct ip_conntrack_tuple reply;

                /* Alter conntrack table so it will recognize replies. */
                invert_tuplepr(&reply, &new_tuple);
                ip_conntrack_alter_reply(conntrack, &reply);

                /* Non-atomic: we own this at the moment. */
                if (maniptype == IP_NAT_MANIP_SRC)
                        conntrack->status |= IPS_SRC_NAT;
                else
                        conntrack->status |= IPS_DST_NAT;
        }

        /* Place in source hash if this is the first time. */
        if (have_to_hash) {
                unsigned int srchash
                        = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
                                      .tuple);
                write_lock_bh(&ip_nat_lock);
                list_add(&info->bysource, &bysource[srchash]);
                write_unlock_bh(&ip_nat_lock);
        }

        /* It's done. */
        if (maniptype == IP_NAT_MANIP_DST)
                set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
        else
                set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);

        return NF_ACCEPT;
}
EXPORT_SYMBOL(ip_nat_setup_info);

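/* A minimal caller sketch (hypothetical target function; names and
 * addresses are examples only): a NAT target such as ipt_SNAT is
 * expected to build an ip_nat_range and invoke ip_nat_setup_info() from
 * a hook whose HOOK2MANIP() value matches the intended manipulation. */
#if 0
static unsigned int example_snat_setup(struct ip_conntrack *ct)
{
        struct ip_nat_range range = {
                .flags  = IP_NAT_RANGE_MAP_IPS,
                .min_ip = htonl(0xcb007101),    /* 203.0.113.1, example */
                .max_ip = htonl(0xcb007101),
        };

        /* NF_IP_POST_ROUTING implies IP_NAT_MANIP_SRC via HOOK2MANIP(). */
        return ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
}
#endif
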
/* Returns true if succeeded. */
static int
manip_pkt(u_int16_t proto,
          struct sk_buff **pskb,
          unsigned int iphdroff,
          const struct ip_conntrack_tuple *target,
          enum ip_nat_manip_type maniptype)
{
        struct iphdr *iph;
        struct ip_nat_protocol *p;

        if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
                return 0;

        iph = (void *)(*pskb)->data + iphdroff;

        /* Manipulate the protocol part. */
        p = ip_nat_proto_find_get(proto);
        if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
                ip_nat_proto_put(p);
                return 0;
        }
        ip_nat_proto_put(p);

        /* Reload: the protocol manip may have reallocated the skb. */
        iph = (void *)(*pskb)->data + iphdroff;

        if (maniptype == IP_NAT_MANIP_SRC) {
                iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
                                                iph->check);
                iph->saddr = target->src.ip;
        } else {
                iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
                                                iph->check);
                iph->daddr = target->dst.ip;
        }
        return 1;
}

/* Do packet manipulations according to ip_nat_setup_info. */
unsigned int ip_nat_packet(struct ip_conntrack *ct,
                           enum ip_conntrack_info ctinfo,
                           unsigned int hooknum,
                           struct sk_buff **pskb)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        unsigned long statusbit;
        enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);

        if (mtype == IP_NAT_MANIP_SRC)
                statusbit = IPS_SRC_NAT;
        else
                statusbit = IPS_DST_NAT;

        /* Invert if this is reply dir. */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        /* Non-atomic: these bits don't change. */
        if (ct->status & statusbit) {
                struct ip_conntrack_tuple target;

                /* We are aiming to look like inverse of other direction. */
                invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

                if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
                        return NF_DROP;
        }
        return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(ip_nat_packet);

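/* Aside: the "statusbit ^= IPS_NAT_MASK" flip above works because
 * IPS_NAT_MASK is IPS_SRC_NAT | IPS_DST_NAT, so XORing with the mask
 * exchanges one bit for the other: a reply-direction packet of an
 * SRC-NATed connection needs the DST manipulation, and vice versa. */
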
/* Dir is direction ICMP is coming from (opposite to packet it contains) */
int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
                                  struct ip_conntrack *ct,
                                  enum ip_nat_manip_type manip,
                                  enum ip_conntrack_dir dir)
{
        struct {
                struct icmphdr icmp;
                struct iphdr ip;
        } *inside;
        struct ip_conntrack_tuple inner, target;
        int hdrlen = (*pskb)->nh.iph->ihl * 4;

        if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
                return 0;

        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;

        /* We're actually going to mangle it beyond trivial checksum
           adjustment, so make sure the current checksum is correct. */
        if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
                hdrlen = (*pskb)->nh.iph->ihl * 4;
                if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
                                                (*pskb)->len - hdrlen, 0)))
                        return 0;
        }

        /* Must be RELATED */
        IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
                     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);

        /* Redirects on non-null nats must be dropped, else they'll
           start talking to each other without our translation, and be
           confused... --RR */
        if (inside->icmp.type == ICMP_REDIRECT) {
                /* If NAT isn't finished, assume it and drop. */
                if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
                        return 0;

                if (ct->status & IPS_NAT_MASK)
                        return 0;
        }

        DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
               *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");

        if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
                             sizeof(struct icmphdr) + inside->ip.ihl*4,
                             &inner,
                             __ip_conntrack_proto_find(inside->ip.protocol)))
                return 0;

        /* Change inner back to look like incoming packet.  We do the
           opposite manip on this hook to normal, because it might not
           pass all hooks (locally-generated ICMP).  Consider incoming
           packet: PREROUTING (DST manip), routing produces ICMP, goes
           through POSTROUTING (which must correct the DST manip). */
        if (!manip_pkt(inside->ip.protocol, pskb,
                       (*pskb)->nh.iph->ihl*4
                       + sizeof(inside->icmp),
                       &ct->tuplehash[!dir].tuple,
                       !manip))
                return 0;

        /* Reload "inside" since manip_pkt mangled the inner packet. */
        inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
        inside->icmp.checksum = 0;
        inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
                                                       (*pskb)->len - hdrlen,
                                                       0));

        /* Change outer to look like the reply to an incoming packet
         * (proto 0 means don't invert per-proto part). */

        /* Obviously, we need to NAT destination IP, but source IP
           should be NAT'ed only if it is from a NAT'd host.

           Explanation: some people use NAT for anonymizing.  Also,
           CERT recommends dropping all packets from private IP
           addresses (although ICMP errors from internal links with
           such addresses are not too uncommon, as Alan Cox points
           out) */
        if (manip != IP_NAT_MANIP_SRC
            || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
                invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
                if (!manip_pkt(0, pskb, 0, &target, manip))
                        return 0;
        }

        return 1;
}
EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);

/* Protocol registration. */
int ip_nat_protocol_register(struct ip_nat_protocol *proto)
{
        int ret = 0;

        write_lock_bh(&ip_nat_lock);
        if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
                ret = -EBUSY;
                goto out;
        }
        ip_nat_protos[proto->protonum] = proto;
 out:
        write_unlock_bh(&ip_nat_lock);
        return ret;
}
EXPORT_SYMBOL(ip_nat_protocol_register);

/* No one stores the protocol pointer anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
{
        write_lock_bh(&ip_nat_lock);
        ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
        write_unlock_bh(&ip_nat_lock);

        /* Someone could still be looking at the proto in a bh. */
        synchronize_net();
}
EXPORT_SYMBOL(ip_nat_protocol_unregister);

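/* A registration sketch (hypothetical module; ip_nat_protocol_example
 * is assumed, not defined in this file): protocol helpers register at
 * module init and unregister at exit; registration fails with -EBUSY
 * if the protocol number is already claimed. */
#if 0
extern struct ip_nat_protocol ip_nat_protocol_example;

static int __init example_nat_proto_init(void)
{
        return ip_nat_protocol_register(&ip_nat_protocol_example);
}

static void __exit example_nat_proto_fini(void)
{
        ip_nat_protocol_unregister(&ip_nat_protocol_example);
}
#endif
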
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
int
ip_nat_port_range_to_nfattr(struct sk_buff *skb,
                            const struct ip_nat_range *range)
{
        NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
                &range->min.tcp.port);
        NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
                &range->max.tcp.port);

        return 0;

nfattr_failure:
        return -1;
}

int
ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
{
        int ret = 0;

        /* we have to return whether we actually parsed something or not */
        if (tb[CTA_PROTONAT_PORT_MIN-1]) {
                ret = 1;
                range->min.tcp.port =
                        *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
        }

        if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
                /* Only a minimum was given: use it for the maximum too. */
                if (ret)
                        range->max.tcp.port = range->min.tcp.port;
        } else {
                ret = 1;
                range->max.tcp.port =
                        *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
#endif

static int __init ip_nat_init(void)
{
        size_t i;

        /* Leave them the same for the moment. */
        ip_nat_htable_size = ip_conntrack_htable_size;

        /* One vmalloc'd hash table, keyed by source. */
        bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
        if (!bysource)
                return -ENOMEM;

        /* Sew in builtin protocols. */
        write_lock_bh(&ip_nat_lock);
        for (i = 0; i < MAX_IP_NAT_PROTO; i++)
                ip_nat_protos[i] = &ip_nat_unknown_protocol;
        ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
        ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
        ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
        write_unlock_bh(&ip_nat_lock);

        for (i = 0; i < ip_nat_htable_size; i++) {
                INIT_LIST_HEAD(&bysource[i]);
        }

        /* FIXME: Man, this is a hack.  <SIGH> */
        IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
        ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;

        /* Initialize fake conntrack so that NAT will skip it */
        ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;

        return 0;
}

/* Clear NAT section of all conntracks, in case we're loaded again. */
static int clean_nat(struct ip_conntrack *i, void *data)
{
        memset(&i->nat, 0, sizeof(i->nat));
        i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
        return 0;
}

static void __exit ip_nat_cleanup(void)
{
        ip_ct_iterate_cleanup(&clean_nat, NULL);
        ip_conntrack_destroyed = NULL;
        vfree(bysource);
}

MODULE_LICENSE("GPL");

module_init(ip_nat_init);
module_exit(ip_nat_cleanup);