2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
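/*
 * Illustration (not part of this file): the range above can be read and
 * changed at run time through the ip_local_port_range sysctl.  A minimal
 * userspace sketch, assuming the usual procfs path and run with enough
 * privilege; the replacement values are arbitrary examples.
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <stdio.h>

int main(void)
{
	int low, high;
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r+");

	if (!f) {
		perror("ip_local_port_range");
		return 1;
	}
	if (fscanf(f, "%d %d", &low, &high) == 2)
		printf("current range: %d-%d\n", low, high);

	rewind(f);
	/* Widen the range; example values only. */
	fprintf(f, "32768 61000\n");
	fclose(f);
	return 0;
}
#endif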
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
132 /* Obtain a reference to a local port for the given sock,
133 * if snum is zero it means select any available local port.
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its mutex.
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
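/*
 * Illustration (not part of this file): from userspace, passing port 0 to
 * bind() exercises the "select any available local port" path above, and
 * getsockname() reports which port was chosen.  A minimal sketch using only
 * standard socket calls:
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = 0;	/* 0 == let the kernel pick a free port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    getsockname(fd, (struct sockaddr *)&addr, &len) < 0) {
		perror("bind/getsockname");
		return 1;
	}
	printf("kernel chose port %u\n", ntohs(addr.sin_port));
	close(fd);
	return 0;
}
#endif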
231 static void tcp_v4_hash(struct sock *sk)
233 inet_hash(&tcp_hashinfo, sk);
236 void tcp_unhash(struct sock *sk)
238 inet_unhash(&tcp_hashinfo, sk);
241 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
243 return secure_tcp_sequence_number(skb->nh.iph->daddr,
249 /* called with local bh disabled */
250 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
251 struct inet_timewait_sock **twp)
253 struct inet_sock *inet = inet_sk(sk);
254 u32 daddr = inet->rcv_saddr;
255 u32 saddr = inet->daddr;
256 int dif = sk->sk_bound_dev_if;
257 INET_ADDR_COOKIE(acookie, saddr, daddr)
258 const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
259 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
260 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
262 const struct hlist_node *node;
263 struct inet_timewait_sock *tw;
265 write_lock(&head->lock);
267 /* Check TIME-WAIT sockets first. */
268 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
271 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
272 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
273 struct tcp_sock *tp = tcp_sk(sk);
275 /* With PAWS, it is safe from the viewpoint
276 of data integrity. Even without PAWS it
277 is safe provided sequence spaces do not
278 overlap i.e. at data rates <= 80Mbit/sec.
280 Actually, the idea is close to VJ's: only the
281 timestamp cache is held not per host but per
282 port pair, and the TW bucket is used as the state holder.
285 If the TW bucket has already been destroyed, we
286 fall back to VJ's scheme and use the initial
287 timestamp retrieved from the peer table.
289 if (tcptw->tw_ts_recent_stamp &&
290 (!twp || (sysctl_tcp_tw_reuse &&
292 tcptw->tw_ts_recent_stamp > 1))) {
293 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
294 if (tp->write_seq == 0)
296 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
297 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
306 /* And established part... */
307 sk_for_each(sk2, node, &head->chain) {
308 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
313 /* Must record num and sport now. Otherwise we will see
314 * a socket with a funny identity in the hash table. */
316 inet->sport = htons(lport);
317 sk->sk_hashent = hash;
318 BUG_TRAP(sk_unhashed(sk));
319 __sk_add_node(sk, &head->chain);
320 sock_prot_inc_use(sk->sk_prot);
321 write_unlock(&head->lock);
325 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
327 /* Silly. Should hash-dance instead... */
328 tcp_tw_deschedule(tw);
329 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
337 write_unlock(&head->lock);
338 return -EADDRNOTAVAIL;
341 static inline u32 connect_port_offset(const struct sock *sk)
343 const struct inet_sock *inet = inet_sk(sk);
345 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
350 * Bind a port for a connect operation and hash it.
352 static inline int tcp_v4_hash_connect(struct sock *sk)
354 const unsigned short snum = inet_sk(sk)->num;
355 struct inet_bind_hashbucket *head;
356 struct inet_bind_bucket *tb;
360 int low = sysctl_local_port_range[0];
361 int high = sysctl_local_port_range[1];
362 int range = high - low;
366 u32 offset = hint + connect_port_offset(sk);
367 struct hlist_node *node;
368 struct inet_timewait_sock *tw = NULL;
371 for (i = 1; i <= range; i++) {
372 port = low + (i + offset) % range;
373 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
374 spin_lock(&head->lock);
376 /* Does not bother with rcv_saddr checks,
377 * because the established check is already
380 inet_bind_bucket_for_each(tb, node, &head->chain) {
381 if (tb->port == port) {
382 BUG_TRAP(!hlist_empty(&tb->owners));
383 if (tb->fastreuse >= 0)
385 if (!__tcp_v4_check_established(sk,
393 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
395 spin_unlock(&head->lock);
402 spin_unlock(&head->lock);
406 return -EADDRNOTAVAIL;
411 /* Head lock still held and bh's disabled */
412 inet_bind_hash(sk, tb, port);
413 if (sk_unhashed(sk)) {
414 inet_sk(sk)->sport = htons(port);
415 __inet_hash(&tcp_hashinfo, sk, 0);
417 spin_unlock(&head->lock);
420 tcp_tw_deschedule(tw);
428 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
429 tb = inet_sk(sk)->bind_hash;
430 spin_lock_bh(&head->lock);
431 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
432 __inet_hash(&tcp_hashinfo, sk, 0);
433 spin_unlock_bh(&head->lock);
436 spin_unlock(&head->lock);
437 /* No definite answer... Walk the established hash table */
438 ret = __tcp_v4_check_established(sk, snum, NULL);
445 /* This will initiate an outgoing connection. */
446 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
448 struct inet_sock *inet = inet_sk(sk);
449 struct tcp_sock *tp = tcp_sk(sk);
450 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
456 if (addr_len < sizeof(struct sockaddr_in))
459 if (usin->sin_family != AF_INET)
460 return -EAFNOSUPPORT;
462 nexthop = daddr = usin->sin_addr.s_addr;
463 if (inet->opt && inet->opt->srr) {
466 nexthop = inet->opt->faddr;
469 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
470 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
472 inet->sport, usin->sin_port, sk);
476 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
481 if (!inet->opt || !inet->opt->srr)
485 inet->saddr = rt->rt_src;
486 inet->rcv_saddr = inet->saddr;
488 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
489 /* Reset inherited state */
490 tp->rx_opt.ts_recent = 0;
491 tp->rx_opt.ts_recent_stamp = 0;
495 if (sysctl_tcp_tw_recycle &&
496 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
497 struct inet_peer *peer = rt_get_peer(rt);
499 /* VJ's idea. We save the last timestamp seen from
500 * the destination in the peer table when entering TIME-WAIT state,
501 * and initialize rx_opt.ts_recent from it when trying a new connection.
504 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
505 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
506 tp->rx_opt.ts_recent = peer->tcp_ts;
510 inet->dport = usin->sin_port;
513 tp->ext_header_len = 0;
515 tp->ext_header_len = inet->opt->optlen;
517 tp->rx_opt.mss_clamp = 536;
519 /* Socket identity is still unknown (sport may be zero).
520 * However, we set the state to SYN-SENT and, without releasing the socket
521 * lock, select a source port, enter ourselves into the hash tables and
522 * complete initialization after this.
524 tcp_set_state(sk, TCP_SYN_SENT);
525 err = tcp_v4_hash_connect(sk);
529 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
533 /* OK, now commit destination to socket. */
534 sk_setup_caps(sk, &rt->u.dst);
537 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
542 inet->id = tp->write_seq ^ jiffies;
544 err = tcp_connect(sk);
552 /* This unhashes the socket and releases the local port, if necessary. */
553 tcp_set_state(sk, TCP_CLOSE);
555 sk->sk_route_caps = 0;
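/*
 * Illustration (not part of this file): tcp_v4_connect() is reached from a
 * plain userspace connect() on a TCP socket.  A minimal sketch; the
 * destination address and port are arbitrary examples.
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;		/* anything else fails with -EAFNOSUPPORT above */
	dst.sin_port = htons(80);		/* example port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address (TEST-NET-1) */

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");		/* connection errors surface here */
	close(fd);
	return 0;
}
#endif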
560 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
562 return ((struct rtable *)skb->dst)->rt_iif;
565 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
567 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
570 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
571 struct request_sock ***prevp,
573 __u32 raddr, __u32 laddr)
575 struct listen_sock *lopt = tp->accept_queue.listen_opt;
576 struct request_sock *req, **prev;
578 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
579 (req = *prev) != NULL;
580 prev = &req->dl_next) {
581 const struct inet_request_sock *ireq = inet_rsk(req);
583 if (ireq->rmt_port == rport &&
584 ireq->rmt_addr == raddr &&
585 ireq->loc_addr == laddr &&
586 TCP_INET_FAMILY(req->rsk_ops->family)) {
596 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct listen_sock *lopt = tp->accept_queue.listen_opt;
600 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
602 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
608 * This routine does path mtu discovery as defined in RFC1191.
610 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
613 struct dst_entry *dst;
614 struct inet_sock *inet = inet_sk(sk);
615 struct tcp_sock *tp = tcp_sk(sk);
617 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
618 * sent out by Linux are always < 576 bytes so they should go through
621 if (sk->sk_state == TCP_LISTEN)
624 /* We don't check in the dst entry if pmtu discovery is forbidden
625 * on this route. We just assume that no packet-too-big packets
626 * are sent back when pmtu discovery is not active.
627 * There is a small race when the user changes this flag in the
628 * route, but I think that's acceptable.
630 if ((dst = __sk_dst_check(sk, 0)) == NULL)
633 dst->ops->update_pmtu(dst, mtu);
635 /* Something is about to go wrong... Remember the soft error
636 * in case this connection is not able to recover.
638 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
639 sk->sk_err_soft = EMSGSIZE;
643 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
644 tp->pmtu_cookie > mtu) {
645 tcp_sync_mss(sk, mtu);
647 /* Resend the TCP packet because it's
648 * clear that the old packet has been
649 * dropped. This is the new "fast" path mtu discovery.
652 tcp_simple_retransmit(sk);
653 } /* else let the usual retransmit timer handle it */
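/*
 * Illustration (not part of this file): userspace can opt in or out of the
 * behaviour above with the IP_MTU_DISCOVER socket option, and can read the
 * path MTU the kernel has learned with IP_MTU (connected sockets only).  A
 * minimal sketch; it assumes "fd" is an already connected TCP socket, and on
 * older libcs these constants may only be available via <linux/in.h>.
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <netinet/in.h>		/* IP_MTU_DISCOVER, IP_PMTUDISC_DO, IP_MTU */
#include <stdio.h>
#include <sys/socket.h>

static void show_path_mtu(int fd)
{
	int val = IP_PMTUDISC_DO;	/* always set DF, rely on PMTU discovery */
	int mtu;
	socklen_t len = sizeof(mtu);

	if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val)) < 0)
		perror("IP_MTU_DISCOVER");
	if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
		printf("current path MTU: %d\n", mtu);
	else
		perror("IP_MTU");
}
#endif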
657 * This routine is called by the ICMP module when it gets some
658 * sort of error condition. If err < 0 then the socket should
659 * be closed and the error returned to the user. If err > 0
660 * it's just the icmp type << 8 | icmp code. After adjustment
661 * header points to the first 8 bytes of the tcp header. We need
662 * to find the appropriate port.
664 * The locking strategy used here is very "optimistic". When
665 * someone else accesses the socket the ICMP is just dropped
666 * and for some paths there is no check at all.
667 * A more general error queue to queue errors for later handling
668 * is probably better.
672 void tcp_v4_err(struct sk_buff *skb, u32 info)
674 struct iphdr *iph = (struct iphdr *)skb->data;
675 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
677 struct inet_sock *inet;
678 int type = skb->h.icmph->type;
679 int code = skb->h.icmph->code;
684 if (skb->len < (iph->ihl << 2) + 8) {
685 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
689 sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
690 th->source, tcp_v4_iif(skb));
692 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
695 if (sk->sk_state == TCP_TIME_WAIT) {
696 inet_twsk_put((struct inet_timewait_sock *)sk);
701 /* If too many ICMPs get dropped on busy
702 * servers this needs to be solved differently.
704 if (sock_owned_by_user(sk))
705 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
707 if (sk->sk_state == TCP_CLOSE)
711 seq = ntohl(th->seq);
712 if (sk->sk_state != TCP_LISTEN &&
713 !between(seq, tp->snd_una, tp->snd_nxt)) {
714 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
719 case ICMP_SOURCE_QUENCH:
720 /* Just silently ignore these. */
722 case ICMP_PARAMETERPROB:
725 case ICMP_DEST_UNREACH:
726 if (code > NR_ICMP_UNREACH)
729 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
730 if (!sock_owned_by_user(sk))
731 do_pmtu_discovery(sk, iph, info);
735 err = icmp_err_convert[code].errno;
737 case ICMP_TIME_EXCEEDED:
744 switch (sk->sk_state) {
745 struct request_sock *req, **prev;
747 if (sock_owned_by_user(sk))
750 req = tcp_v4_search_req(tp, &prev, th->dest,
751 iph->daddr, iph->saddr);
755 /* ICMPs are not backlogged, hence we cannot get
756 an established socket here.
760 if (seq != tcp_rsk(req)->snt_isn) {
761 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
766 * Still in SYN_RECV, just remove it silently.
767 * There is no good way to pass the error to the newly
768 * created socket, and POSIX does not want network
769 * errors returned from accept().
771 tcp_synq_drop(sk, req, prev);
775 case TCP_SYN_RECV: /* Cannot happen.
776 It can happen, e.g., if SYNs crossed.
778 if (!sock_owned_by_user(sk)) {
779 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
782 sk->sk_error_report(sk);
786 sk->sk_err_soft = err;
791 /* If we've already connected we will keep trying
792 * until we time out, or the user gives up.
794 * rfc1122 4.2.3.9 allows us to consider as hard errors
795 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
796 * but it is obsoleted by pmtu discovery).
798 * Note that in the modern internet, where routing is unreliable
799 * and broken firewalls sit in every dark corner sending random
800 * errors ordered by their masters, even these two messages finally lose
801 * their original sense (even Linux sends invalid PORT_UNREACHs).
803 * Now we are in compliance with RFCs.
808 if (!sock_owned_by_user(sk) && inet->recverr) {
810 sk->sk_error_report(sk);
811 } else { /* Only an error on timeout */
812 sk->sk_err_soft = err;
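/*
 * Illustration (not part of this file): when inet->recverr is set (the
 * IP_RECVERR socket option), errors like the ones handled above are also
 * queued for userspace and can be drained with MSG_ERRQUEUE.  A minimal
 * sketch, assuming "fd" is a connected TCP socket:
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <linux/errqueue.h>	/* struct sock_extended_err */
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void drain_error_queue(int fd)
{
	int one = 1;
	char cbuf[256], data[1];
	struct msghdr msg;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct cmsghdr *cmsg;

	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;		/* nothing queued */

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee = (void *)CMSG_DATA(cmsg);

			printf("err %d origin %u icmp type %u code %u\n",
			       ee->ee_errno, ee->ee_origin, ee->ee_type, ee->ee_code);
		}
	}
}
#endif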
820 /* This routine computes an IPv4 TCP checksum. */
821 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
824 struct inet_sock *inet = inet_sk(sk);
826 if (skb->ip_summed == CHECKSUM_HW) {
827 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
828 skb->csum = offsetof(struct tcphdr, check);
830 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
831 csum_partial((char *)th,
838 * This routine will send an RST to the other tcp.
840 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
842 * Answer: if a packet caused an RST, it is not for a socket
843 * existing in our system; if it is matched to a socket,
844 * it is just a duplicate segment or a bug in the other side's TCP.
845 * So we build the reply based only on parameters
846 * that arrived with the segment.
847 * Exception: precedence violation. We do not implement it in any case.
850 static void tcp_v4_send_reset(struct sk_buff *skb)
852 struct tcphdr *th = skb->h.th;
854 struct ip_reply_arg arg;
856 /* Never send a reset in response to a reset. */
860 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
863 /* Swap the send and the receive. */
864 memset(&rth, 0, sizeof(struct tcphdr));
865 rth.dest = th->source;
866 rth.source = th->dest;
867 rth.doff = sizeof(struct tcphdr) / 4;
871 rth.seq = th->ack_seq;
874 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
875 skb->len - (th->doff << 2));
878 memset(&arg, 0, sizeof arg);
879 arg.iov[0].iov_base = (unsigned char *)&rth;
880 arg.iov[0].iov_len = sizeof rth;
881 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
882 skb->nh.iph->saddr, /*XXX*/
883 sizeof(struct tcphdr), IPPROTO_TCP, 0);
884 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
886 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
888 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
889 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
892 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
893 outside of socket context, is ugly, certainly. What can I do?
896 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
899 struct tcphdr *th = skb->h.th;
904 struct ip_reply_arg arg;
906 memset(&rep.th, 0, sizeof(struct tcphdr));
907 memset(&arg, 0, sizeof arg);
909 arg.iov[0].iov_base = (unsigned char *)&rep;
910 arg.iov[0].iov_len = sizeof(rep.th);
912 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
913 (TCPOPT_TIMESTAMP << 8) |
915 rep.tsopt[1] = htonl(tcp_time_stamp);
916 rep.tsopt[2] = htonl(ts);
917 arg.iov[0].iov_len = sizeof(rep);
920 /* Swap the send and the receive. */
921 rep.th.dest = th->source;
922 rep.th.source = th->dest;
923 rep.th.doff = arg.iov[0].iov_len / 4;
924 rep.th.seq = htonl(seq);
925 rep.th.ack_seq = htonl(ack);
927 rep.th.window = htons(win);
929 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
930 skb->nh.iph->saddr, /*XXX*/
931 arg.iov[0].iov_len, IPPROTO_TCP, 0);
932 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
934 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
936 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
939 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
941 struct inet_timewait_sock *tw = inet_twsk(sk);
942 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
944 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
945 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
950 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
952 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
956 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
957 struct request_sock *req)
960 const struct inet_request_sock *ireq = inet_rsk(req);
961 struct ip_options *opt = inet_rsk(req)->opt;
962 struct flowi fl = { .oif = sk->sk_bound_dev_if,
964 { .daddr = ((opt && opt->srr) ?
967 .saddr = ireq->loc_addr,
968 .tos = RT_CONN_FLAGS(sk) } },
969 .proto = IPPROTO_TCP,
971 { .sport = inet_sk(sk)->sport,
972 .dport = ireq->rmt_port } } };
974 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
975 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
978 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
980 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
987 * Send a SYN-ACK after having received a SYN.
988 * This still operates on a request_sock only, not on a big
991 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
992 struct dst_entry *dst)
994 const struct inet_request_sock *ireq = inet_rsk(req);
996 struct sk_buff * skb;
998 /* First, grab a route. */
999 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1002 skb = tcp_make_synack(sk, dst, req);
1005 struct tcphdr *th = skb->h.th;
1007 th->check = tcp_v4_check(th, skb->len,
1010 csum_partial((char *)th, skb->len,
1013 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1016 if (err == NET_XMIT_CN)
1026 * IPv4 request_sock destructor.
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1030 if (inet_rsk(req)->opt)
1031 kfree(inet_rsk(req)->opt);
1034 static inline void syn_flood_warning(struct sk_buff *skb)
1036 static unsigned long warntime;
1038 if (time_after(jiffies, (warntime + HZ * 60))) {
1041 "possible SYN flooding on port %d. Sending cookies.\n",
1042 ntohs(skb->h.th->dest));
1047 * Save and compile IPv4 options into the request_sock if needed.
1049 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1050 struct sk_buff *skb)
1052 struct ip_options *opt = &(IPCB(skb)->opt);
1053 struct ip_options *dopt = NULL;
1055 if (opt && opt->optlen) {
1056 int opt_size = optlength(opt);
1057 dopt = kmalloc(opt_size, GFP_ATOMIC);
1059 if (ip_options_echo(dopt, skb)) {
1068 struct request_sock_ops tcp_request_sock_ops = {
1070 .obj_size = sizeof(struct tcp_request_sock),
1071 .rtx_syn_ack = tcp_v4_send_synack,
1072 .send_ack = tcp_v4_reqsk_send_ack,
1073 .destructor = tcp_v4_reqsk_destructor,
1074 .send_reset = tcp_v4_send_reset,
1077 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1079 struct inet_request_sock *ireq;
1080 struct tcp_options_received tmp_opt;
1081 struct request_sock *req;
1082 __u32 saddr = skb->nh.iph->saddr;
1083 __u32 daddr = skb->nh.iph->daddr;
1084 __u32 isn = TCP_SKB_CB(skb)->when;
1085 struct dst_entry *dst = NULL;
1086 #ifdef CONFIG_SYN_COOKIES
1087 int want_cookie = 0;
1089 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1092 /* Never answer SYNs sent to broadcast or multicast */
1093 if (((struct rtable *)skb->dst)->rt_flags &
1094 (RTCF_BROADCAST | RTCF_MULTICAST))
1097 /* TW buckets are converted to open requests without
1098 * limitations: they conserve resources and the peer is
1099 * evidently a real one.
1101 if (tcp_synq_is_full(sk) && !isn) {
1102 #ifdef CONFIG_SYN_COOKIES
1103 if (sysctl_tcp_syncookies) {
1110 /* Accept backlog is full. If we have already queued enough
1111 * warm entries in the syn queue, drop the request. It is better than
1112 * clogging the syn queue with openreqs with exponentially increasing timeout.
1115 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1118 req = reqsk_alloc(&tcp_request_sock_ops);
1122 tcp_clear_options(&tmp_opt);
1123 tmp_opt.mss_clamp = 536;
1124 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1126 tcp_parse_options(skb, &tmp_opt, 0);
1129 tcp_clear_options(&tmp_opt);
1130 tmp_opt.saw_tstamp = 0;
1133 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1134 /* Some OSes (unknown ones, but I see them on a web server which
1135 * contains information interesting only for Windows
1136 * users) do not send their timestamp in the SYN. It is an easy case:
1137 * we simply do not advertise TS support.
1139 tmp_opt.saw_tstamp = 0;
1140 tmp_opt.tstamp_ok = 0;
1142 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1144 tcp_openreq_init(req, &tmp_opt, skb);
1146 ireq = inet_rsk(req);
1147 ireq->loc_addr = daddr;
1148 ireq->rmt_addr = saddr;
1149 ireq->opt = tcp_v4_save_options(sk, skb);
1151 TCP_ECN_create_request(req, skb->h.th);
1154 #ifdef CONFIG_SYN_COOKIES
1155 syn_flood_warning(skb);
1157 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1159 struct inet_peer *peer = NULL;
1161 /* VJ's idea. We save the last timestamp seen
1162 * from the destination in the peer table when entering
1163 * TIME-WAIT state, and check against it before
1164 * accepting a new connection request.
1166 * If "isn" is not zero, this request hit an alive
1167 * timewait bucket, so all the necessary checks
1168 * are made in the function processing the timewait state.
1170 if (tmp_opt.saw_tstamp &&
1171 sysctl_tcp_tw_recycle &&
1172 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1173 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1174 peer->v4daddr == saddr) {
1175 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1176 (s32)(peer->tcp_ts - req->ts_recent) >
1178 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1183 /* Kill the following clause, if you dislike this way. */
1184 else if (!sysctl_tcp_syncookies &&
1185 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1186 (sysctl_max_syn_backlog >> 2)) &&
1187 (!peer || !peer->tcp_ts_stamp) &&
1188 (!dst || !dst_metric(dst, RTAX_RTT))) {
1189 /* Without syncookies the last quarter of the
1190 * backlog is filled only with destinations
1191 * proven to be alive.
1192 * It means that we continue to communicate
1193 * with destinations already remembered
1194 * at the moment of the synflood.
1196 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1197 "request from %u.%u."
1200 ntohs(skb->h.th->source)));
1205 isn = tcp_v4_init_sequence(sk, skb);
1207 tcp_rsk(req)->snt_isn = isn;
1209 if (tcp_v4_send_synack(sk, req, dst))
1215 tcp_v4_synq_add(sk, req);
1222 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
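/*
 * Illustration (not part of this file): the SYN queue filled above exists
 * because userspace called listen(); the backlog argument bounds how many
 * established-but-not-yet-accepted connections may queue.  A minimal sketch
 * of the corresponding userspace side, with an arbitrary example port and
 * backlog value:
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);	/* example port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {	/* 128: example backlog */
		perror("bind/listen");
		return 1;
	}
	for (;;) {
		int c = accept(fd, NULL, NULL);	/* handshake already completed in-kernel */

		if (c < 0)
			continue;
		close(c);
	}
}
#endif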
1228 * The three-way handshake has completed - we got a valid ACK -
1229 * now create the new socket.
1231 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1232 struct request_sock *req,
1233 struct dst_entry *dst)
1235 struct inet_request_sock *ireq;
1236 struct inet_sock *newinet;
1237 struct tcp_sock *newtp;
1240 if (sk_acceptq_is_full(sk))
1243 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1246 newsk = tcp_create_openreq_child(sk, req, skb);
1250 sk_setup_caps(newsk, dst);
1252 newtp = tcp_sk(newsk);
1253 newinet = inet_sk(newsk);
1254 ireq = inet_rsk(req);
1255 newinet->daddr = ireq->rmt_addr;
1256 newinet->rcv_saddr = ireq->loc_addr;
1257 newinet->saddr = ireq->loc_addr;
1258 newinet->opt = ireq->opt;
1260 newinet->mc_index = tcp_v4_iif(skb);
1261 newinet->mc_ttl = skb->nh.iph->ttl;
1262 newtp->ext_header_len = 0;
1264 newtp->ext_header_len = newinet->opt->optlen;
1265 newinet->id = newtp->write_seq ^ jiffies;
1267 tcp_sync_mss(newsk, dst_mtu(dst));
1268 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1269 tcp_initialize_rcv_mss(newsk);
1271 __inet_hash(&tcp_hashinfo, newsk, 0);
1272 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1277 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1279 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1284 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1286 struct tcphdr *th = skb->h.th;
1287 struct iphdr *iph = skb->nh.iph;
1288 struct tcp_sock *tp = tcp_sk(sk);
1290 struct request_sock **prev;
1291 /* Find possible connection requests. */
1292 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1293 iph->saddr, iph->daddr);
1295 return tcp_check_req(sk, skb, req, prev);
1297 nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1298 th->source, skb->nh.iph->daddr,
1299 ntohs(th->dest), tcp_v4_iif(skb));
1302 if (nsk->sk_state != TCP_TIME_WAIT) {
1306 inet_twsk_put((struct inet_timewait_sock *)nsk);
1310 #ifdef CONFIG_SYN_COOKIES
1311 if (!th->rst && !th->syn && th->ack)
1312 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1317 static int tcp_v4_checksum_init(struct sk_buff *skb)
1319 if (skb->ip_summed == CHECKSUM_HW) {
1320 skb->ip_summed = CHECKSUM_UNNECESSARY;
1321 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1322 skb->nh.iph->daddr, skb->csum))
1325 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1326 skb->ip_summed = CHECKSUM_NONE;
1328 if (skb->len <= 76) {
1329 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1331 skb_checksum(skb, 0, skb->len, 0)))
1333 skb->ip_summed = CHECKSUM_UNNECESSARY;
1335 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1337 skb->nh.iph->daddr, 0);
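/*
 * Illustration (not part of this file): tcp_v4_check()/csum_partial() compute
 * the standard ones-complement Internet checksum (RFC 1071) over the TCP
 * pseudo-header and segment.  Below is a portable, unoptimized sketch of the
 * same arithmetic; the kernel's versions are arch-specific and incremental,
 * and the pseudo-header bytes would be summed in the same way.
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <stddef.h>
#include <stdint.h>

/* Sum 16-bit words, fold the carries, and return the ones complement. */
static uint16_t inet_checksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte is padded with zero */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif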
1343 /* The socket must have its spinlock held when we get
1346 * We have a potential double-lock case here, so even when
1347 * doing backlog processing we use the BH locking scheme.
1348 * This is because we cannot sleep with the original spinlock
1351 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1353 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1354 TCP_CHECK_TIMER(sk);
1355 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1357 TCP_CHECK_TIMER(sk);
1361 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1364 if (sk->sk_state == TCP_LISTEN) {
1365 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1370 if (tcp_child_process(sk, nsk, skb))
1376 TCP_CHECK_TIMER(sk);
1377 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1379 TCP_CHECK_TIMER(sk);
1383 tcp_v4_send_reset(skb);
1386 /* Be careful here. If this function gets more complicated and
1387 * gcc suffers from register pressure on the x86, sk (in %ebx)
1388 * might be destroyed here. This current version compiles correctly,
1389 * but you have been warned.
1394 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1402 int tcp_v4_rcv(struct sk_buff *skb)
1408 if (skb->pkt_type != PACKET_HOST)
1411 /* Count it even if it's bad */
1412 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1414 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1419 if (th->doff < sizeof(struct tcphdr) / 4)
1421 if (!pskb_may_pull(skb, th->doff * 4))
1424 /* An explanation is required here, I think.
1425 * Packet length and doff are validated by header prediction,
1426 * provided the case of th->doff==0 is eliminated.
1427 * So, we defer the checks. */
1428 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1429 tcp_v4_checksum_init(skb) < 0))
1433 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1434 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1435 skb->len - th->doff * 4);
1436 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1437 TCP_SKB_CB(skb)->when = 0;
1438 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1439 TCP_SKB_CB(skb)->sacked = 0;
1441 sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1442 skb->nh.iph->daddr, ntohs(th->dest),
1449 if (sk->sk_state == TCP_TIME_WAIT)
1452 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1453 goto discard_and_relse;
1455 if (sk_filter(sk, skb, 0))
1456 goto discard_and_relse;
1462 if (!sock_owned_by_user(sk)) {
1463 if (!tcp_prequeue(sk, skb))
1464 ret = tcp_v4_do_rcv(sk, skb);
1466 sk_add_backlog(sk, skb);
1474 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1477 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1479 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1481 tcp_v4_send_reset(skb);
1485 /* Discard frame. */
1494 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1495 inet_twsk_put((struct inet_timewait_sock *) sk);
1499 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1500 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1501 inet_twsk_put((struct inet_timewait_sock *) sk);
1504 switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1507 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1512 tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1513 inet_twsk_put((struct inet_timewait_sock *)sk);
1517 /* Fall through to ACK */
1520 tcp_v4_timewait_ack(sk, skb);
1524 case TCP_TW_SUCCESS:;
1529 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1531 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1532 struct inet_sock *inet = inet_sk(sk);
1534 sin->sin_family = AF_INET;
1535 sin->sin_addr.s_addr = inet->daddr;
1536 sin->sin_port = inet->dport;
1539 /* VJ's idea. Save the last timestamp seen from this destination
1540 * and hold it at least for the normal timewait interval, to use for duplicate
1541 * segment detection in subsequent connections before they enter the synchronized state.
1545 int tcp_v4_remember_stamp(struct sock *sk)
1547 struct inet_sock *inet = inet_sk(sk);
1548 struct tcp_sock *tp = tcp_sk(sk);
1549 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1550 struct inet_peer *peer = NULL;
1553 if (!rt || rt->rt_dst != inet->daddr) {
1554 peer = inet_getpeer(inet->daddr, 1);
1558 rt_bind_peer(rt, 1);
1563 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1564 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1565 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1566 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1567 peer->tcp_ts = tp->rx_opt.ts_recent;
1577 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1579 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1582 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1584 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1585 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1586 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1587 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1588 peer->tcp_ts = tcptw->tw_ts_recent;
1597 struct tcp_func ipv4_specific = {
1598 .queue_xmit = ip_queue_xmit,
1599 .send_check = tcp_v4_send_check,
1600 .rebuild_header = inet_sk_rebuild_header,
1601 .conn_request = tcp_v4_conn_request,
1602 .syn_recv_sock = tcp_v4_syn_recv_sock,
1603 .remember_stamp = tcp_v4_remember_stamp,
1604 .net_header_len = sizeof(struct iphdr),
1605 .setsockopt = ip_setsockopt,
1606 .getsockopt = ip_getsockopt,
1607 .addr2sockaddr = v4_addr2sockaddr,
1608 .sockaddr_len = sizeof(struct sockaddr_in),
1611 /* NOTE: A lot of things are set to zero explicitly by the call to
1612 * sk_alloc(), so they need not be done here.
1614 static int tcp_v4_init_sock(struct sock *sk)
1616 struct tcp_sock *tp = tcp_sk(sk);
1618 skb_queue_head_init(&tp->out_of_order_queue);
1619 tcp_init_xmit_timers(sk);
1620 tcp_prequeue_init(tp);
1622 tp->rto = TCP_TIMEOUT_INIT;
1623 tp->mdev = TCP_TIMEOUT_INIT;
1625 /* So many TCP implementations out there (incorrectly) count the
1626 * initial SYN frame in their delayed-ACK and congestion control
1627 * algorithms that we must have the following bandaid to talk
1628 * efficiently to them. -DaveM
1632 /* See draft-stevens-tcpca-spec-01 for discussion of the
1633 * initialization of these values.
1635 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1636 tp->snd_cwnd_clamp = ~0;
1637 tp->mss_cache = 536;
1639 tp->reordering = sysctl_tcp_reordering;
1640 tp->ca_ops = &tcp_init_congestion_ops;
1642 sk->sk_state = TCP_CLOSE;
1644 sk->sk_write_space = sk_stream_write_space;
1645 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1647 tp->af_specific = &ipv4_specific;
1649 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1650 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1652 atomic_inc(&tcp_sockets_allocated);
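/*
 * Illustration (not part of this file): the sk_sndbuf/sk_rcvbuf defaults
 * assigned above from tcp_wmem[1]/tcp_rmem[1] can be observed from userspace
 * on a fresh TCP socket.  A minimal sketch; the exact numbers depend on the
 * tcp_wmem/tcp_rmem sysctls on the running system.
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int snd = 0, rcv = 0;
	socklen_t len = sizeof(int);

	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
	len = sizeof(int);
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
	printf("default SO_SNDBUF=%d SO_RCVBUF=%d\n", snd, rcv);
	close(fd);
	return 0;
}
#endif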
1657 int tcp_v4_destroy_sock(struct sock *sk)
1659 struct tcp_sock *tp = tcp_sk(sk);
1661 tcp_clear_xmit_timers(sk);
1663 tcp_cleanup_congestion_control(tp);
1665 /* Clean up the write buffer. */
1666 sk_stream_writequeue_purge(sk);
1668 /* Cleans up our, hopefully empty, out_of_order_queue. */
1669 __skb_queue_purge(&tp->out_of_order_queue);
1671 /* Clean the prequeue; it must really be empty */
1672 __skb_queue_purge(&tp->ucopy.prequeue);
1674 /* Clean up a referenced TCP bind bucket. */
1675 if (inet_sk(sk)->bind_hash)
1676 inet_put_port(&tcp_hashinfo, sk);
1679 * If sendmsg cached page exists, toss it.
1681 if (sk->sk_sndmsg_page) {
1682 __free_page(sk->sk_sndmsg_page);
1683 sk->sk_sndmsg_page = NULL;
1686 atomic_dec(&tcp_sockets_allocated);
1691 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1693 #ifdef CONFIG_PROC_FS
1694 /* Proc filesystem TCP sock list dumping. */
1696 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1698 return hlist_empty(head) ? NULL :
1699 list_entry(head->first, struct inet_timewait_sock, tw_node);
1702 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1704 return tw->tw_node.next ?
1705 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1708 static void *listening_get_next(struct seq_file *seq, void *cur)
1710 struct tcp_sock *tp;
1711 struct hlist_node *node;
1712 struct sock *sk = cur;
1713 struct tcp_iter_state* st = seq->private;
1717 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1723 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1724 struct request_sock *req = cur;
1726 tp = tcp_sk(st->syn_wait_sk);
1730 if (req->rsk_ops->family == st->family) {
1736 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1739 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1741 sk = sk_next(st->syn_wait_sk);
1742 st->state = TCP_SEQ_STATE_LISTENING;
1743 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1746 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1747 if (reqsk_queue_len(&tp->accept_queue))
1749 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1753 sk_for_each_from(sk, node) {
1754 if (sk->sk_family == st->family) {
1759 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1760 if (reqsk_queue_len(&tp->accept_queue)) {
1762 st->uid = sock_i_uid(sk);
1763 st->syn_wait_sk = sk;
1764 st->state = TCP_SEQ_STATE_OPENREQ;
1768 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1770 if (++st->bucket < INET_LHTABLE_SIZE) {
1771 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1779 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1781 void *rc = listening_get_next(seq, NULL);
1783 while (rc && *pos) {
1784 rc = listening_get_next(seq, rc);
1790 static void *established_get_first(struct seq_file *seq)
1792 struct tcp_iter_state* st = seq->private;
1795 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1797 struct hlist_node *node;
1798 struct inet_timewait_sock *tw;
1800 /* We can reschedule _before_ having picked the target: */
1801 cond_resched_softirq();
1803 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1804 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1805 if (sk->sk_family != st->family) {
1811 st->state = TCP_SEQ_STATE_TIME_WAIT;
1812 inet_twsk_for_each(tw, node,
1813 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1814 if (tw->tw_family != st->family) {
1820 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1821 st->state = TCP_SEQ_STATE_ESTABLISHED;
1827 static void *established_get_next(struct seq_file *seq, void *cur)
1829 struct sock *sk = cur;
1830 struct inet_timewait_sock *tw;
1831 struct hlist_node *node;
1832 struct tcp_iter_state* st = seq->private;
1836 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1840 while (tw && tw->tw_family != st->family) {
1847 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1848 st->state = TCP_SEQ_STATE_ESTABLISHED;
1850 /* We can reschedule between buckets: */
1851 cond_resched_softirq();
1853 if (++st->bucket < tcp_hashinfo.ehash_size) {
1854 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1855 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1863 sk_for_each_from(sk, node) {
1864 if (sk->sk_family == st->family)
1868 st->state = TCP_SEQ_STATE_TIME_WAIT;
1869 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1877 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1879 void *rc = established_get_first(seq);
1882 rc = established_get_next(seq, rc);
1888 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1891 struct tcp_iter_state* st = seq->private;
1893 inet_listen_lock(&tcp_hashinfo);
1894 st->state = TCP_SEQ_STATE_LISTENING;
1895 rc = listening_get_idx(seq, &pos);
1898 inet_listen_unlock(&tcp_hashinfo);
1900 st->state = TCP_SEQ_STATE_ESTABLISHED;
1901 rc = established_get_idx(seq, pos);
1907 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1909 struct tcp_iter_state* st = seq->private;
1910 st->state = TCP_SEQ_STATE_LISTENING;
1912 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1915 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1918 struct tcp_iter_state* st;
1920 if (v == SEQ_START_TOKEN) {
1921 rc = tcp_get_idx(seq, 0);
1926 switch (st->state) {
1927 case TCP_SEQ_STATE_OPENREQ:
1928 case TCP_SEQ_STATE_LISTENING:
1929 rc = listening_get_next(seq, v);
1931 inet_listen_unlock(&tcp_hashinfo);
1933 st->state = TCP_SEQ_STATE_ESTABLISHED;
1934 rc = established_get_first(seq);
1937 case TCP_SEQ_STATE_ESTABLISHED:
1938 case TCP_SEQ_STATE_TIME_WAIT:
1939 rc = established_get_next(seq, v);
1947 static void tcp_seq_stop(struct seq_file *seq, void *v)
1949 struct tcp_iter_state* st = seq->private;
1951 switch (st->state) {
1952 case TCP_SEQ_STATE_OPENREQ:
1954 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
1955 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1957 case TCP_SEQ_STATE_LISTENING:
1958 if (v != SEQ_START_TOKEN)
1959 inet_listen_unlock(&tcp_hashinfo);
1961 case TCP_SEQ_STATE_TIME_WAIT:
1962 case TCP_SEQ_STATE_ESTABLISHED:
1964 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1970 static int tcp_seq_open(struct inode *inode, struct file *file)
1972 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1973 struct seq_file *seq;
1974 struct tcp_iter_state *s;
1977 if (unlikely(afinfo == NULL))
1980 s = kmalloc(sizeof(*s), GFP_KERNEL);
1983 memset(s, 0, sizeof(*s));
1984 s->family = afinfo->family;
1985 s->seq_ops.start = tcp_seq_start;
1986 s->seq_ops.next = tcp_seq_next;
1987 s->seq_ops.show = afinfo->seq_show;
1988 s->seq_ops.stop = tcp_seq_stop;
1990 rc = seq_open(file, &s->seq_ops);
1993 seq = file->private_data;
2002 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2005 struct proc_dir_entry *p;
2009 afinfo->seq_fops->owner = afinfo->owner;
2010 afinfo->seq_fops->open = tcp_seq_open;
2011 afinfo->seq_fops->read = seq_read;
2012 afinfo->seq_fops->llseek = seq_lseek;
2013 afinfo->seq_fops->release = seq_release_private;
2015 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2023 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2027 proc_net_remove(afinfo->name);
2028 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2031 static void get_openreq4(struct sock *sk, struct request_sock *req,
2032 char *tmpbuf, int i, int uid)
2034 const struct inet_request_sock *ireq = inet_rsk(req);
2035 int ttd = req->expires - jiffies;
2037 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2038 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2041 ntohs(inet_sk(sk)->sport),
2043 ntohs(ireq->rmt_port),
2045 0, 0, /* could print option size, but that is af dependent. */
2046 1, /* timers active (only the expire timer) */
2047 jiffies_to_clock_t(ttd),
2050 0, /* non standard timer */
2051 0, /* open_requests have no inode */
2052 atomic_read(&sk->sk_refcnt),
2056 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2059 unsigned long timer_expires;
2060 struct tcp_sock *tp = tcp_sk(sp);
2061 struct inet_sock *inet = inet_sk(sp);
2062 unsigned int dest = inet->daddr;
2063 unsigned int src = inet->rcv_saddr;
2064 __u16 destp = ntohs(inet->dport);
2065 __u16 srcp = ntohs(inet->sport);
2067 if (tp->pending == TCP_TIME_RETRANS) {
2069 timer_expires = tp->timeout;
2070 } else if (tp->pending == TCP_TIME_PROBE0) {
2072 timer_expires = tp->timeout;
2073 } else if (timer_pending(&sp->sk_timer)) {
2075 timer_expires = sp->sk_timer.expires;
2078 timer_expires = jiffies;
2081 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2082 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2083 i, src, srcp, dest, destp, sp->sk_state,
2084 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2086 jiffies_to_clock_t(timer_expires - jiffies),
2091 atomic_read(&sp->sk_refcnt), sp,
2092 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2094 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2097 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2099 unsigned int dest, src;
2101 int ttd = tw->tw_ttd - jiffies;
2106 dest = tw->tw_daddr;
2107 src = tw->tw_rcv_saddr;
2108 destp = ntohs(tw->tw_dport);
2109 srcp = ntohs(tw->tw_sport);
2111 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2112 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2113 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2114 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2115 atomic_read(&tw->tw_refcnt), tw);
2120 static int tcp4_seq_show(struct seq_file *seq, void *v)
2122 struct tcp_iter_state* st;
2123 char tmpbuf[TMPSZ + 1];
2125 if (v == SEQ_START_TOKEN) {
2126 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2127 " sl local_address rem_address st tx_queue "
2128 "rx_queue tr tm->when retrnsmt uid timeout "
2134 switch (st->state) {
2135 case TCP_SEQ_STATE_LISTENING:
2136 case TCP_SEQ_STATE_ESTABLISHED:
2137 get_tcp4_sock(v, tmpbuf, st->num);
2139 case TCP_SEQ_STATE_OPENREQ:
2140 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2142 case TCP_SEQ_STATE_TIME_WAIT:
2143 get_timewait4_sock(v, tmpbuf, st->num);
2146 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
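/*
 * Illustration (not part of this file): the lines formatted above are what
 * userspace sees in /proc/net/tcp; addresses and ports are printed in hex by
 * get_tcp4_sock()/get_timewait4_sock() above.  A minimal reader sketch,
 * assuming it runs on the same host (so the hex address bytes can be handed
 * straight to inet_ntop()):
 */
#if 0	/* illustration only -- userspace sketch, not kernel code */
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	char line[512], local[64], remote[64];
	unsigned int sl, laddr, lport, raddr, rport, state;
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " %u: %8X:%4X %8X:%4X %2X",
			   &sl, &laddr, &lport, &raddr, &rport, &state) != 6)
			continue;
		/* The kernel prints the raw __be32 with %08X, so re-reading it
		 * with %X on the same host yields bytes inet_ntop() accepts. */
		inet_ntop(AF_INET, &laddr, local, sizeof(local));
		inet_ntop(AF_INET, &raddr, remote, sizeof(remote));
		printf("%s:%u -> %s:%u state %#x\n", local, lport, remote, rport, state);
	}
	fclose(f);
	return 0;
}
#endif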
2151 static struct file_operations tcp4_seq_fops;
2152 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2153 .owner = THIS_MODULE,
2156 .seq_show = tcp4_seq_show,
2157 .seq_fops = &tcp4_seq_fops,
2160 int __init tcp4_proc_init(void)
2162 return tcp_proc_register(&tcp4_seq_afinfo);
2165 void tcp4_proc_exit(void)
2167 tcp_proc_unregister(&tcp4_seq_afinfo);
2169 #endif /* CONFIG_PROC_FS */
2171 struct proto tcp_prot = {
2173 .owner = THIS_MODULE,
2175 .connect = tcp_v4_connect,
2176 .disconnect = tcp_disconnect,
2177 .accept = tcp_accept,
2179 .init = tcp_v4_init_sock,
2180 .destroy = tcp_v4_destroy_sock,
2181 .shutdown = tcp_shutdown,
2182 .setsockopt = tcp_setsockopt,
2183 .getsockopt = tcp_getsockopt,
2184 .sendmsg = tcp_sendmsg,
2185 .recvmsg = tcp_recvmsg,
2186 .backlog_rcv = tcp_v4_do_rcv,
2187 .hash = tcp_v4_hash,
2188 .unhash = tcp_unhash,
2189 .get_port = tcp_v4_get_port,
2190 .enter_memory_pressure = tcp_enter_memory_pressure,
2191 .sockets_allocated = &tcp_sockets_allocated,
2192 .memory_allocated = &tcp_memory_allocated,
2193 .memory_pressure = &tcp_memory_pressure,
2194 .sysctl_mem = sysctl_tcp_mem,
2195 .sysctl_wmem = sysctl_tcp_wmem,
2196 .sysctl_rmem = sysctl_tcp_rmem,
2197 .max_header = MAX_TCP_HEADER,
2198 .obj_size = sizeof(struct tcp_sock),
2199 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2200 .rsk_prot = &tcp_request_sock_ops,
2205 void __init tcp_v4_init(struct net_proto_family *ops)
2207 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2209 panic("Failed to create the TCP control socket.\n");
2210 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2211 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2213 /* Unhash it so that IP input processing does not even
2214 * see it; we do not wish this socket to see incoming packets.
2217 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2220 EXPORT_SYMBOL(ipv4_specific);
2221 EXPORT_SYMBOL(inet_bind_bucket_create);
2222 EXPORT_SYMBOL(tcp_hashinfo);
2223 EXPORT_SYMBOL(tcp_prot);
2224 EXPORT_SYMBOL(tcp_unhash);
2225 EXPORT_SYMBOL(tcp_v4_conn_request);
2226 EXPORT_SYMBOL(tcp_v4_connect);
2227 EXPORT_SYMBOL(tcp_v4_do_rcv);
2228 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2229 EXPORT_SYMBOL(tcp_v4_send_check);
2230 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2232 #ifdef CONFIG_PROC_FS
2233 EXPORT_SYMBOL(tcp_proc_register);
2234 EXPORT_SYMBOL(tcp_proc_unregister);
2236 EXPORT_SYMBOL(sysctl_local_port_range);
2237 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2238 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);