2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and Alexey Kuznetsov:
51 * Support IPV6_V6ONLY socket option, which allows both
52 * IPv4 and IPv6 sockets to bind a single port at the same time.
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
65 #include <net/net_namespace.h>
67 #include <net/inet_hashtables.h>
69 #include <net/transp_v6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
74 #include <net/netdma.h>
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
88 /* Check TCP sequence numbers in ICMP packets. */
89 #define ICMP_MIN_LENGTH 8
91 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
93 #ifdef CONFIG_TCP_MD5SIG
94 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
96 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
97 __be32 saddr, __be32 daddr,
98 struct tcphdr *th, unsigned int tcplen);
101 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
107 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
108 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
109 .lhash_users = ATOMIC_INIT(0),
110 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
113 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
115 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
118 tcp_hdr(skb)->source);
121 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
124 struct tcp_sock *tp = tcp_sk(sk);
126 /* With PAWS, it is safe from the viewpoint
127 of data integrity. Even without PAWS it is safe provided sequence
128 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
130 Actually, the idea is close to VJ's one, only the timestamp cache is
131 held not per host, but per port pair, and the TW bucket is used as the state holder.
134 If the TW bucket has already been destroyed, we fall back to VJ's scheme
135 and use the initial timestamp retrieved from the peer table.
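   (A rough sanity check of the 80Mbit/sec figure above, added as an
   illustration: at ~10 Mbyte/sec the 32-bit sequence space wraps in about
   2^32 / 10^7 ~= 430 seconds, comfortably longer than a normal TIME-WAIT
   interval, so the old and new incarnations cannot overlap in sequence space.)
 */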
137 if (tcptw->tw_ts_recent_stamp &&
138 (twp == NULL || (sysctl_tcp_tw_reuse &&
139 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
140 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
141 if (tp->write_seq == 0)
143 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
144 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
152 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154 /* This will initiate an outgoing connection. */
155 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157 struct inet_sock *inet = inet_sk(sk);
158 struct tcp_sock *tp = tcp_sk(sk);
159 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161 __be32 daddr, nexthop;
165 if (addr_len < sizeof(struct sockaddr_in))
168 if (usin->sin_family != AF_INET)
169 return -EAFNOSUPPORT;
171 nexthop = daddr = usin->sin_addr.s_addr;
172 if (inet->opt && inet->opt->srr) {
175 nexthop = inet->opt->faddr;
178 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
179 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181 inet->sport, usin->sin_port, sk, 1);
183 if (tmp == -ENETUNREACH)
184 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
188 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
193 if (!inet->opt || !inet->opt->srr)
197 inet->saddr = rt->rt_src;
198 inet->rcv_saddr = inet->saddr;
200 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
201 /* Reset inherited state */
202 tp->rx_opt.ts_recent = 0;
203 tp->rx_opt.ts_recent_stamp = 0;
207 if (tcp_death_row.sysctl_tw_recycle &&
208 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
209 struct inet_peer *peer = rt_get_peer(rt);
211 * VJ's idea. We save last timestamp seen from
212 * the destination in peer table, when entering state
213 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
214 * when trying a new connection.
217 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
218 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
219 tp->rx_opt.ts_recent = peer->tcp_ts;
223 inet->dport = usin->sin_port;
226 inet_csk(sk)->icsk_ext_hdr_len = 0;
228 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
230 tp->rx_opt.mss_clamp = 536;
232 /* Socket identity is still unknown (sport may be zero).
233 * However we set state to SYN-SENT and, without releasing the socket
234 * lock, select a source port, enter ourselves into the hash tables and
235 * complete initialization after this.
237 tcp_set_state(sk, TCP_SYN_SENT);
238 err = inet_hash_connect(&tcp_death_row, sk);
242 err = ip_route_newports(&rt, IPPROTO_TCP,
243 inet->sport, inet->dport, sk);
247 /* OK, now commit destination to socket. */
248 sk->sk_gso_type = SKB_GSO_TCPV4;
249 sk_setup_caps(sk, &rt->u.dst);
252 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
257 inet->id = tp->write_seq ^ jiffies;
259 err = tcp_connect(sk);
268 * This unhashes the socket and releases the local port, if necessary.
271 tcp_set_state(sk, TCP_CLOSE);
273 sk->sk_route_caps = 0;
279 * This routine does path mtu discovery as defined in RFC1191.
281 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
283 struct dst_entry *dst;
284 struct inet_sock *inet = inet_sk(sk);
286 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
287 * sent out by Linux are always < 576 bytes so they should go through unfragmented).
290 if (sk->sk_state == TCP_LISTEN)
293 /* We don't check the dst entry for whether pmtu discovery is forbidden
294 * on this route. We just assume that no packet-too-big packets
295 * are sent back when pmtu discovery is not active.
296 * There is a small race when the user changes this flag in the
297 * route, but I think that's acceptable.
299 if ((dst = __sk_dst_check(sk, 0)) == NULL)
302 dst->ops->update_pmtu(dst, mtu);
304 /* Something is about to go wrong... Remember the soft error
305 * in case this connection is not able to recover.
307 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
308 sk->sk_err_soft = EMSGSIZE;
312 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
313 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
314 tcp_sync_mss(sk, mtu);
316 /* Resend the TCP packet because it's
317 * clear that the old packet has been
318 * dropped. This is the new "fast" path mtu discovery.
321 tcp_simple_retransmit(sk);
322 } /* else let the usual retransmit timer handle it */
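
	/* A rough sketch (not the exact computation in tcp_sync_mss()) of how
	 * the reported MTU becomes an MSS clamp for this connection, ignoring
	 * IP and TCP option space:
	 *
	 *	new_mss ~= mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	 *
	 * tcp_sync_mss() does the precise version, using the af-specific
	 * network header length and any extension headers.
	 */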
326 * This routine is called by the ICMP module when it gets some
327 * sort of error condition. If err < 0 then the socket should
328 * be closed and the error returned to the user. If err > 0
329 * it's just the icmp type << 8 | icmp code. After adjustment,
330 * the header points to the first 8 bytes of the tcp header. We need
331 * to find the appropriate port.
333 * The locking strategy used here is very "optimistic". When
334 * someone else accesses the socket the ICMP is just dropped
335 * and for some paths there is no check at all.
336 * A more general error queue to queue errors for later handling
337 * is probably better.
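 *
 * As a concrete illustration (an added example, not part of the original
 * text): an ICMP "destination unreachable / port unreachable" arrives here
 * as type ICMP_DEST_UNREACH with code ICMP_PORT_UNREACH, and the handling
 * below maps it through icmp_err_convert[code].errno to ECONNREFUSED.
 */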
341 void tcp_v4_err(struct sk_buff *skb, u32 info)
343 struct iphdr *iph = (struct iphdr *)skb->data;
344 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
346 struct inet_sock *inet;
347 const int type = icmp_hdr(skb)->type;
348 const int code = icmp_hdr(skb)->code;
353 if (skb->len < (iph->ihl << 2) + 8) {
354 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
358 sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
359 iph->saddr, th->source, inet_iif(skb));
361 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
364 if (sk->sk_state == TCP_TIME_WAIT) {
365 inet_twsk_put(inet_twsk(sk));
370 /* If too many ICMPs get dropped on busy
371 * servers this needs to be solved differently.
373 if (sock_owned_by_user(sk))
374 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
376 if (sk->sk_state == TCP_CLOSE)
380 seq = ntohl(th->seq);
381 if (sk->sk_state != TCP_LISTEN &&
382 !between(seq, tp->snd_una, tp->snd_nxt)) {
383 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
388 case ICMP_SOURCE_QUENCH:
389 /* Just silently ignore these. */
391 case ICMP_PARAMETERPROB:
394 case ICMP_DEST_UNREACH:
395 if (code > NR_ICMP_UNREACH)
398 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
399 if (!sock_owned_by_user(sk))
400 do_pmtu_discovery(sk, iph, info);
404 err = icmp_err_convert[code].errno;
406 case ICMP_TIME_EXCEEDED:
413 switch (sk->sk_state) {
414 struct request_sock *req, **prev;
416 if (sock_owned_by_user(sk))
419 req = inet_csk_search_req(sk, &prev, th->dest,
420 iph->daddr, iph->saddr);
424 /* ICMPs are not backlogged, hence we cannot get
425 an established socket here.
429 if (seq != tcp_rsk(req)->snt_isn) {
430 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
435 * Still in SYN_RECV, just remove it silently.
436 * There is no good way to pass the error to the newly
437 * created socket, and POSIX does not want network
438 * errors returned from accept().
440 inet_csk_reqsk_queue_drop(sk, req, prev);
444 case TCP_SYN_RECV: /* Cannot happen.
445 It can, e.g., if SYNs crossed.
447 if (!sock_owned_by_user(sk)) {
450 sk->sk_error_report(sk);
454 sk->sk_err_soft = err;
459 /* If we've already connected we will keep trying
460 * until we time out, or the user gives up.
462 * rfc1122 4.2.3.9 allows us to consider as hard errors
463 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
464 * but it is obsoleted by pmtu discovery).
466 * Note, that in modern internet, where routing is unreliable
467 * and in each dark corner broken firewalls sit, sending random
468 * errors ordered by their masters, even these two messages finally lose
469 * their original sense (even Linux sends invalid PORT_UNREACHs)
471 * Now we are in compliance with RFCs.
476 if (!sock_owned_by_user(sk) && inet->recverr) {
478 sk->sk_error_report(sk);
479 } else { /* Only an error on timeout */
480 sk->sk_err_soft = err;
488 /* This routine computes an IPv4 TCP checksum. */
489 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
491 struct inet_sock *inet = inet_sk(sk);
492 struct tcphdr *th = tcp_hdr(skb);
494 if (skb->ip_summed == CHECKSUM_PARTIAL) {
495 th->check = ~tcp_v4_check(len, inet->saddr,
497 skb->csum_start = skb_transport_header(skb) - skb->head;
498 skb->csum_offset = offsetof(struct tcphdr, check);
500 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
501 csum_partial((char *)th,
507 int tcp_v4_gso_send_check(struct sk_buff *skb)
509 const struct iphdr *iph;
512 if (!pskb_may_pull(skb, sizeof(*th)))
519 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
520 skb->csum_start = skb_transport_header(skb) - skb->head;
521 skb->csum_offset = offsetof(struct tcphdr, check);
522 skb->ip_summed = CHECKSUM_PARTIAL;
527 * This routine will send an RST to the other tcp.
529 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
531 * Answer: if a packet caused RST, it is not for a socket
532 * existing in our system; if it is matched to a socket,
533 * it is just a duplicate segment or a bug in the other side's TCP.
534 * So we build the reply based only on the parameters
535 * that arrived with the segment.
536 * Exception: precedence violation. We do not implement it in any case.
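 *
 *	A concrete example of the rule above (added; it simply restates what
 *	the code below does): for an unexpected SYN carrying no data we reply
 *	with seq = 0 and ack_seq = ntohl(th->seq) + 1, while for a stray ACK
 *	we echo its ack_seq back as our own seq and set no ACK bit at all.
 */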
539 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
541 struct tcphdr *th = tcp_hdr(skb);
544 #ifdef CONFIG_TCP_MD5SIG
545 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
548 struct ip_reply_arg arg;
549 #ifdef CONFIG_TCP_MD5SIG
550 struct tcp_md5sig_key *key;
553 /* Never send a reset in response to a reset. */
557 if (skb->rtable->rt_type != RTN_LOCAL)
560 /* Swap the send and the receive. */
561 memset(&rep, 0, sizeof(rep));
562 rep.th.dest = th->source;
563 rep.th.source = th->dest;
564 rep.th.doff = sizeof(struct tcphdr) / 4;
568 rep.th.seq = th->ack_seq;
571 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
572 skb->len - (th->doff << 2));
575 memset(&arg, 0, sizeof(arg));
576 arg.iov[0].iov_base = (unsigned char *)&rep;
577 arg.iov[0].iov_len = sizeof(rep.th);
579 #ifdef CONFIG_TCP_MD5SIG
580 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
582 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
584 (TCPOPT_MD5SIG << 8) |
586 /* Update length and the length the header thinks exists */
587 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
588 rep.th.doff = arg.iov[0].iov_len / 4;
590 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
594 &rep.th, arg.iov[0].iov_len);
597 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
598 ip_hdr(skb)->saddr, /* XXX */
599 sizeof(struct tcphdr), IPPROTO_TCP, 0);
600 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
602 ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
603 &arg, arg.iov[0].iov_len);
605 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
606 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
609 /* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
610 outside socket context, is ugly, certainly. What can I do?
613 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
614 u32 win, u32 ts, int oif,
615 struct tcp_md5sig_key *key)
617 struct tcphdr *th = tcp_hdr(skb);
620 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
621 #ifdef CONFIG_TCP_MD5SIG
622 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
626 struct ip_reply_arg arg;
628 memset(&rep.th, 0, sizeof(struct tcphdr));
629 memset(&arg, 0, sizeof(arg));
631 arg.iov[0].iov_base = (unsigned char *)&rep;
632 arg.iov[0].iov_len = sizeof(rep.th);
634 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635 (TCPOPT_TIMESTAMP << 8) |
637 rep.opt[1] = htonl(tcp_time_stamp);
638 rep.opt[2] = htonl(ts);
639 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
642 /* Swap the send and the receive. */
643 rep.th.dest = th->source;
644 rep.th.source = th->dest;
645 rep.th.doff = arg.iov[0].iov_len / 4;
646 rep.th.seq = htonl(seq);
647 rep.th.ack_seq = htonl(ack);
649 rep.th.window = htons(win);
651 #ifdef CONFIG_TCP_MD5SIG
653 int offset = (ts) ? 3 : 0;
655 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
657 (TCPOPT_MD5SIG << 8) |
659 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
660 rep.th.doff = arg.iov[0].iov_len/4;
662 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
666 &rep.th, arg.iov[0].iov_len);
669 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
670 ip_hdr(skb)->saddr, /* XXX */
671 arg.iov[0].iov_len, IPPROTO_TCP, 0);
672 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
674 arg.bound_dev_if = oif;
676 ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
677 &arg, arg.iov[0].iov_len);
679 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
682 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
684 struct inet_timewait_sock *tw = inet_twsk(sk);
685 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
687 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
688 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
691 tcp_twsk_md5_key(tcptw)
697 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
698 struct request_sock *req)
700 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
701 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
704 tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
708 * Send a SYN-ACK after having received a SYN.
709 * This still operates on a request_sock only, not on a big
712 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
713 struct dst_entry *dst)
715 const struct inet_request_sock *ireq = inet_rsk(req);
717 struct sk_buff * skb;
719 /* First, grab a route. */
720 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
723 skb = tcp_make_synack(sk, dst, req);
726 struct tcphdr *th = tcp_hdr(skb);
728 th->check = tcp_v4_check(skb->len,
731 csum_partial((char *)th, skb->len,
734 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
737 err = net_xmit_eval(err);
744 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
746 return __tcp_v4_send_synack(sk, req, NULL);
750 * IPv4 request_sock destructor.
752 static void tcp_v4_reqsk_destructor(struct request_sock *req)
754 kfree(inet_rsk(req)->opt);
757 #ifdef CONFIG_SYN_COOKIES
758 static void syn_flood_warning(struct sk_buff *skb)
760 static unsigned long warntime;
762 if (time_after(jiffies, (warntime + HZ * 60))) {
765 "possible SYN flooding on port %d. Sending cookies.\n",
766 ntohs(tcp_hdr(skb)->dest));
772 * Save and compile IPv4 options into the request_sock if needed.
774 static struct ip_options *tcp_v4_save_options(struct sock *sk,
777 struct ip_options *opt = &(IPCB(skb)->opt);
778 struct ip_options *dopt = NULL;
780 if (opt && opt->optlen) {
781 int opt_size = optlength(opt);
782 dopt = kmalloc(opt_size, GFP_ATOMIC);
784 if (ip_options_echo(dopt, skb)) {
793 #ifdef CONFIG_TCP_MD5SIG
795 * RFC2385 MD5 checksumming requires a mapping of
796 * IP address->MD5 Key.
797 * We need to maintain these in the sk structure.
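
/*
 * A minimal user-space sketch of how such a key gets installed (hedged; the
 * exact struct tcp_md5sig layout lives in <linux/tcp.h>): the application
 * fills tcpm_addr with the peer address and tcpm_key/tcpm_keylen with the
 * shared secret, then calls
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 *
 * which is parsed by tcp_v4_parse_md5_keys() below and stored via
 * tcp_v4_md5_do_add().
 */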
800 /* Find the Key structure for an address. */
801 static struct tcp_md5sig_key *
802 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
804 struct tcp_sock *tp = tcp_sk(sk);
807 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
809 for (i = 0; i < tp->md5sig_info->entries4; i++) {
810 if (tp->md5sig_info->keys4[i].addr == addr)
811 return &tp->md5sig_info->keys4[i].base;
816 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
817 struct sock *addr_sk)
819 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
822 EXPORT_SYMBOL(tcp_v4_md5_lookup);
824 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
825 struct request_sock *req)
827 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
830 /* This can be called on a newly created socket, from other files */
831 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
832 u8 *newkey, u8 newkeylen)
834 /* Add Key to the list */
835 struct tcp_md5sig_key *key;
836 struct tcp_sock *tp = tcp_sk(sk);
837 struct tcp4_md5sig_key *keys;
839 key = tcp_v4_md5_do_lookup(sk, addr);
841 /* Pre-existing entry - just update that one. */
844 key->keylen = newkeylen;
846 struct tcp_md5sig_info *md5sig;
848 if (!tp->md5sig_info) {
849 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
851 if (!tp->md5sig_info) {
855 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
857 if (tcp_alloc_md5sig_pool() == NULL) {
861 md5sig = tp->md5sig_info;
863 if (md5sig->alloced4 == md5sig->entries4) {
864 keys = kmalloc((sizeof(*keys) *
865 (md5sig->entries4 + 1)), GFP_ATOMIC);
868 tcp_free_md5sig_pool();
872 if (md5sig->entries4)
873 memcpy(keys, md5sig->keys4,
874 sizeof(*keys) * md5sig->entries4);
876 /* Free old key list, and reference new one */
877 kfree(md5sig->keys4);
878 md5sig->keys4 = keys;
882 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
883 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
884 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
889 EXPORT_SYMBOL(tcp_v4_md5_do_add);
891 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
892 u8 *newkey, u8 newkeylen)
894 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
898 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
900 struct tcp_sock *tp = tcp_sk(sk);
903 for (i = 0; i < tp->md5sig_info->entries4; i++) {
904 if (tp->md5sig_info->keys4[i].addr == addr) {
906 kfree(tp->md5sig_info->keys4[i].base.key);
907 tp->md5sig_info->entries4--;
909 if (tp->md5sig_info->entries4 == 0) {
910 kfree(tp->md5sig_info->keys4);
911 tp->md5sig_info->keys4 = NULL;
912 tp->md5sig_info->alloced4 = 0;
913 } else if (tp->md5sig_info->entries4 != i) {
914 /* Need to do some manipulation */
915 memmove(&tp->md5sig_info->keys4[i],
916 &tp->md5sig_info->keys4[i+1],
917 (tp->md5sig_info->entries4 - i) *
918 sizeof(struct tcp4_md5sig_key));
920 tcp_free_md5sig_pool();
927 EXPORT_SYMBOL(tcp_v4_md5_do_del);
929 static void tcp_v4_clear_md5_list(struct sock *sk)
931 struct tcp_sock *tp = tcp_sk(sk);
933 /* Free each key, then the key array itself,
934 * the crypto element, and then decrement our
935 * hold on the last resort crypto.
937 if (tp->md5sig_info->entries4) {
939 for (i = 0; i < tp->md5sig_info->entries4; i++)
940 kfree(tp->md5sig_info->keys4[i].base.key);
941 tp->md5sig_info->entries4 = 0;
942 tcp_free_md5sig_pool();
944 if (tp->md5sig_info->keys4) {
945 kfree(tp->md5sig_info->keys4);
946 tp->md5sig_info->keys4 = NULL;
947 tp->md5sig_info->alloced4 = 0;
951 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
954 struct tcp_md5sig cmd;
955 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
958 if (optlen < sizeof(cmd))
961 if (copy_from_user(&cmd, optval, sizeof(cmd)))
964 if (sin->sin_family != AF_INET)
967 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
968 if (!tcp_sk(sk)->md5sig_info)
970 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
973 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
976 if (!tcp_sk(sk)->md5sig_info) {
977 struct tcp_sock *tp = tcp_sk(sk);
978 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
984 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
987 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
990 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
991 newkey, cmd.tcpm_keylen);
994 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
995 __be32 saddr, __be32 daddr,
999 struct tcp_md5sig_pool *hp;
1000 struct tcp4_pseudohdr *bp;
1004 * Okay, so RFC2385 is turned on for this connection,
1005 * so we need to generate the MD5 hash for the packet now.
1008 hp = tcp_get_md5sig_pool();
1010 goto clear_hash_noput;
1012 bp = &hp->md5_blk.ip4;
1015 * The TCP pseudo-header (in the order: source IP address,
1016 * destination IP address, zero-padded protocol number, and segment length).
1022 bp->protocol = IPPROTO_TCP;
1023 bp->len = htons(tcplen);
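
	/* For reference, the layout just filled in (a sketch mirroring
	 * struct tcp4_pseudohdr): saddr, daddr, one zero pad byte, the
	 * protocol number and the segment length - exactly the pseudo-header
	 * that the normal TCP checksum covers.
	 */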
1025 err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1030 /* Free up the crypto pool */
1031 tcp_put_md5sig_pool();
1035 tcp_put_md5sig_pool();
1037 memset(md5_hash, 0, 16);
1041 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1043 struct dst_entry *dst,
1044 struct request_sock *req,
1046 unsigned int tcplen)
1048 __be32 saddr, daddr;
1051 saddr = inet_sk(sk)->saddr;
1052 daddr = inet_sk(sk)->daddr;
1054 struct rtable *rt = (struct rtable *)dst;
1059 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1064 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1066 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1069 * This gets called for each TCP segment that arrives
1070 * so we want to be efficient.
1071 * We have 3 drop cases:
1072 * o No MD5 hash and one expected.
1073 * o MD5 hash and we're not expecting one.
1074 * o MD5 hash and it's wrong.
1076 __u8 *hash_location = NULL;
1077 struct tcp_md5sig_key *hash_expected;
1078 const struct iphdr *iph = ip_hdr(skb);
1079 struct tcphdr *th = tcp_hdr(skb);
1081 unsigned char newhash[16];
1083 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1084 hash_location = tcp_parse_md5sig_option(th);
1086 /* We've parsed the options - do we have a hash? */
1087 if (!hash_expected && !hash_location)
1090 if (hash_expected && !hash_location) {
1091 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1092 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1093 NIPQUAD(iph->saddr), ntohs(th->source),
1094 NIPQUAD(iph->daddr), ntohs(th->dest));
1098 if (!hash_expected && hash_location) {
1099 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1100 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1101 NIPQUAD(iph->saddr), ntohs(th->source),
1102 NIPQUAD(iph->daddr), ntohs(th->dest));
1106 /* Okay, so this is hash_expected and hash_location -
1107 * so we need to calculate the MD5 hash.
1109 genhash = tcp_v4_do_calc_md5_hash(newhash,
1111 iph->saddr, iph->daddr,
1114 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1115 if (net_ratelimit()) {
1116 printk(KERN_INFO "MD5 Hash failed for "
1117 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1118 NIPQUAD(iph->saddr), ntohs(th->source),
1119 NIPQUAD(iph->daddr), ntohs(th->dest),
1120 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1129 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1131 .obj_size = sizeof(struct tcp_request_sock),
1132 .rtx_syn_ack = tcp_v4_send_synack,
1133 .send_ack = tcp_v4_reqsk_send_ack,
1134 .destructor = tcp_v4_reqsk_destructor,
1135 .send_reset = tcp_v4_send_reset,
1138 #ifdef CONFIG_TCP_MD5SIG
1139 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1140 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1144 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1145 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1146 .twsk_unique = tcp_twsk_unique,
1147 .twsk_destructor= tcp_twsk_destructor,
1150 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1152 struct inet_request_sock *ireq;
1153 struct tcp_options_received tmp_opt;
1154 struct request_sock *req;
1155 __be32 saddr = ip_hdr(skb)->saddr;
1156 __be32 daddr = ip_hdr(skb)->daddr;
1157 __u32 isn = TCP_SKB_CB(skb)->when;
1158 struct dst_entry *dst = NULL;
1159 #ifdef CONFIG_SYN_COOKIES
1160 int want_cookie = 0;
1162 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1165 /* Never answer SYNs sent to broadcast or multicast */
1166 if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1169 /* TW buckets are converted to open requests without
1170 * limitations; they conserve resources and the peer is
1171 * evidently a real one.
1173 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1174 #ifdef CONFIG_SYN_COOKIES
1175 if (sysctl_tcp_syncookies) {
1182 /* Accept backlog is full. If we have already queued enough
1183 * warm entries in the syn queue, drop the request. It is better than
1184 * clogging the syn queue with openreqs with exponentially increasing timeout.
1187 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1190 req = reqsk_alloc(&tcp_request_sock_ops);
1194 #ifdef CONFIG_TCP_MD5SIG
1195 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1198 tcp_clear_options(&tmp_opt);
1199 tmp_opt.mss_clamp = 536;
1200 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1202 tcp_parse_options(skb, &tmp_opt, 0);
1204 if (want_cookie && !tmp_opt.saw_tstamp)
1205 tcp_clear_options(&tmp_opt);
1207 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1208 /* Some OSes (unknown ones, but I see them on a web server, which
1209 * contains information interesting only for Windows
1210 * users) do not send their timestamp in SYN. It is the easy case.
1211 * We simply do not advertise TS support.
1213 tmp_opt.saw_tstamp = 0;
1214 tmp_opt.tstamp_ok = 0;
1216 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1218 tcp_openreq_init(req, &tmp_opt, skb);
1220 if (security_inet_conn_request(sk, skb, req))
1223 ireq = inet_rsk(req);
1224 ireq->loc_addr = daddr;
1225 ireq->rmt_addr = saddr;
1226 ireq->opt = tcp_v4_save_options(sk, skb);
1228 TCP_ECN_create_request(req, tcp_hdr(skb));
1231 #ifdef CONFIG_SYN_COOKIES
1232 syn_flood_warning(skb);
1233 req->cookie_ts = tmp_opt.tstamp_ok;
1235 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1237 struct inet_peer *peer = NULL;
1239 /* VJ's idea. We save last timestamp seen
1240 * from the destination in peer table, when entering
1241 * state TIME-WAIT, and check against it before
1242 * accepting new connection request.
1244 * If "isn" is not zero, this request hit an alive
1245 * timewait bucket, so all the necessary checks
1246 * are made in the function processing the timewait state.
1248 if (tmp_opt.saw_tstamp &&
1249 tcp_death_row.sysctl_tw_recycle &&
1250 (dst = inet_csk_route_req(sk, req)) != NULL &&
1251 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1252 peer->v4daddr == saddr) {
1253 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1254 (s32)(peer->tcp_ts - req->ts_recent) >
1256 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1257 goto drop_and_release;
1260 /* Kill the following clause, if you dislike this way. */
1261 else if (!sysctl_tcp_syncookies &&
1262 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1263 (sysctl_max_syn_backlog >> 2)) &&
1264 (!peer || !peer->tcp_ts_stamp) &&
1265 (!dst || !dst_metric(dst, RTAX_RTT))) {
1266 /* Without syncookies, the last quarter of the
1267 * backlog is filled with destinations
1268 * proven to be alive.
1269 * It means that we continue to communicate
1270 * with destinations already remembered
1271 * at the moment of the synflood.
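			 *
			 * A worked example with illustrative numbers (not
			 * from the original comment): with
			 * sysctl_max_syn_backlog at 1024 the test above
			 * starts dropping unproven destinations once fewer
			 * than 256 free slots remain, i.e. once the SYN
			 * queue is more than three quarters full.
			 */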
1273 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1274 "request from " NIPQUAD_FMT "/%u\n",
1276 ntohs(tcp_hdr(skb)->source));
1277 goto drop_and_release;
1280 isn = tcp_v4_init_sequence(skb);
1282 tcp_rsk(req)->snt_isn = isn;
1284 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1287 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1300 * The three way handshake has completed - we got a valid synack -
1301 * now create the new socket.
1303 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1304 struct request_sock *req,
1305 struct dst_entry *dst)
1307 struct inet_request_sock *ireq;
1308 struct inet_sock *newinet;
1309 struct tcp_sock *newtp;
1311 #ifdef CONFIG_TCP_MD5SIG
1312 struct tcp_md5sig_key *key;
1315 if (sk_acceptq_is_full(sk))
1318 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1321 newsk = tcp_create_openreq_child(sk, req, skb);
1325 newsk->sk_gso_type = SKB_GSO_TCPV4;
1326 sk_setup_caps(newsk, dst);
1328 newtp = tcp_sk(newsk);
1329 newinet = inet_sk(newsk);
1330 ireq = inet_rsk(req);
1331 newinet->daddr = ireq->rmt_addr;
1332 newinet->rcv_saddr = ireq->loc_addr;
1333 newinet->saddr = ireq->loc_addr;
1334 newinet->opt = ireq->opt;
1336 newinet->mc_index = inet_iif(skb);
1337 newinet->mc_ttl = ip_hdr(skb)->ttl;
1338 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1340 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1341 newinet->id = newtp->write_seq ^ jiffies;
1343 tcp_mtup_init(newsk);
1344 tcp_sync_mss(newsk, dst_mtu(dst));
1345 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1346 tcp_initialize_rcv_mss(newsk);
1348 #ifdef CONFIG_TCP_MD5SIG
1349 /* Copy over the MD5 key from the original socket */
1350 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1352 * We're using one, so create a matching key
1353 * on the newsk structure. If we fail to get
1354 * memory, then we end up not copying the key across.
1357 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1359 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1360 newkey, key->keylen);
1364 __inet_hash_nolisten(newsk);
1365 __inet_inherit_port(sk, newsk);
1370 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1372 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1377 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1379 struct tcphdr *th = tcp_hdr(skb);
1380 const struct iphdr *iph = ip_hdr(skb);
1382 struct request_sock **prev;
1383 /* Find possible connection requests. */
1384 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1385 iph->saddr, iph->daddr);
1387 return tcp_check_req(sk, skb, req, prev);
1389 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1390 th->source, iph->daddr, th->dest, inet_iif(skb));
1393 if (nsk->sk_state != TCP_TIME_WAIT) {
1397 inet_twsk_put(inet_twsk(nsk));
1401 #ifdef CONFIG_SYN_COOKIES
1402 if (!th->rst && !th->syn && th->ack)
1403 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1408 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1410 const struct iphdr *iph = ip_hdr(skb);
1412 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1413 if (!tcp_v4_check(skb->len, iph->saddr,
1414 iph->daddr, skb->csum)) {
1415 skb->ip_summed = CHECKSUM_UNNECESSARY;
1420 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1421 skb->len, IPPROTO_TCP, 0);
1423 if (skb->len <= 76) {
1424 return __skb_checksum_complete(skb);
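	/* (Added note: segments of at most 76 bytes are cheap enough to
	 *  verify immediately, as above; longer ones keep the pseudo-header
	 *  sum in skb->csum and are verified later, e.g. via
	 *  tcp_checksum_complete() or while copying to user space.) */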
1430 /* The socket must have its spinlock held when we get here.
1433 * We have a potential double-lock case here, so even when
1434 * doing backlog processing we use the BH locking scheme.
1435 * This is because we cannot sleep with the original spinlock held.
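 *
 * The expected calling pattern (a sketch of what tcp_v4_rcv() below does):
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);	-- process in softirq context
 *	else
 *		sk_add_backlog(sk, skb);	-- owner holds the lock: defer
 *	bh_unlock_sock(sk);
 */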
1438 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1441 #ifdef CONFIG_TCP_MD5SIG
1443 * We really want to reject the packet as early as possible if:
1445 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1446 * o There is an MD5 option and we're not expecting one
1448 if (tcp_v4_inbound_md5_hash(sk, skb))
1452 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1453 TCP_CHECK_TIMER(sk);
1454 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1458 TCP_CHECK_TIMER(sk);
1462 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1465 if (sk->sk_state == TCP_LISTEN) {
1466 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1471 if (tcp_child_process(sk, nsk, skb)) {
1479 TCP_CHECK_TIMER(sk);
1480 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1484 TCP_CHECK_TIMER(sk);
1488 tcp_v4_send_reset(rsk, skb);
1491 /* Be careful here. If this function gets more complicated and
1492 * gcc suffers from register pressure on the x86, sk (in %ebx)
1493 * might be destroyed here. This current version compiles correctly,
1494 * but you have been warned.
1499 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1507 int tcp_v4_rcv(struct sk_buff *skb)
1509 const struct iphdr *iph;
1514 if (skb->pkt_type != PACKET_HOST)
1517 /* Count it even if it's bad */
1518 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1520 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1525 if (th->doff < sizeof(struct tcphdr) / 4)
1527 if (!pskb_may_pull(skb, th->doff * 4))
1530 /* An explanation is required here, I think.
1531 * Packet length and doff are validated by header prediction,
1532 * provided the case of th->doff == 0 is eliminated.
1533 * So, we defer the checks. */
1534 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1539 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1540 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1541 skb->len - th->doff * 4);
1542 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1543 TCP_SKB_CB(skb)->when = 0;
1544 TCP_SKB_CB(skb)->flags = iph->tos;
1545 TCP_SKB_CB(skb)->sacked = 0;
1547 sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1548 th->source, iph->daddr, th->dest, inet_iif(skb));
1553 if (sk->sk_state == TCP_TIME_WAIT)
1556 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1557 goto discard_and_relse;
1560 if (sk_filter(sk, skb))
1561 goto discard_and_relse;
1565 bh_lock_sock_nested(sk);
1567 if (!sock_owned_by_user(sk)) {
1568 #ifdef CONFIG_NET_DMA
1569 struct tcp_sock *tp = tcp_sk(sk);
1570 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1571 tp->ucopy.dma_chan = get_softnet_dma();
1572 if (tp->ucopy.dma_chan)
1573 ret = tcp_v4_do_rcv(sk, skb);
1577 if (!tcp_prequeue(sk, skb))
1578 ret = tcp_v4_do_rcv(sk, skb);
1581 sk_add_backlog(sk, skb);
1589 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1592 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1594 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1596 tcp_v4_send_reset(NULL, skb);
1600 /* Discard frame. */
1609 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1610 inet_twsk_put(inet_twsk(sk));
1614 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1616 inet_twsk_put(inet_twsk(sk));
1619 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1621 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1623 iph->daddr, th->dest,
1626 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1627 inet_twsk_put(inet_twsk(sk));
1631 /* Fall through to ACK */
1634 tcp_v4_timewait_ack(sk, skb);
1638 case TCP_TW_SUCCESS:;
1643 /* VJ's idea. Save last timestamp seen from this destination
1644 * and hold it at least for the normal timewait interval, to use for duplicate
1645 * segment detection in subsequent connections, before they enter the synchronized state.
1649 int tcp_v4_remember_stamp(struct sock *sk)
1651 struct inet_sock *inet = inet_sk(sk);
1652 struct tcp_sock *tp = tcp_sk(sk);
1653 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1654 struct inet_peer *peer = NULL;
1657 if (!rt || rt->rt_dst != inet->daddr) {
1658 peer = inet_getpeer(inet->daddr, 1);
1662 rt_bind_peer(rt, 1);
1667 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1668 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1669 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1670 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1671 peer->tcp_ts = tp->rx_opt.ts_recent;
1681 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1683 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1686 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1688 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1689 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1690 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1691 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1692 peer->tcp_ts = tcptw->tw_ts_recent;
1701 struct inet_connection_sock_af_ops ipv4_specific = {
1702 .queue_xmit = ip_queue_xmit,
1703 .send_check = tcp_v4_send_check,
1704 .rebuild_header = inet_sk_rebuild_header,
1705 .conn_request = tcp_v4_conn_request,
1706 .syn_recv_sock = tcp_v4_syn_recv_sock,
1707 .remember_stamp = tcp_v4_remember_stamp,
1708 .net_header_len = sizeof(struct iphdr),
1709 .setsockopt = ip_setsockopt,
1710 .getsockopt = ip_getsockopt,
1711 .addr2sockaddr = inet_csk_addr2sockaddr,
1712 .sockaddr_len = sizeof(struct sockaddr_in),
1713 .bind_conflict = inet_csk_bind_conflict,
1714 #ifdef CONFIG_COMPAT
1715 .compat_setsockopt = compat_ip_setsockopt,
1716 .compat_getsockopt = compat_ip_getsockopt,
1720 #ifdef CONFIG_TCP_MD5SIG
1721 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1722 .md5_lookup = tcp_v4_md5_lookup,
1723 .calc_md5_hash = tcp_v4_calc_md5_hash,
1724 .md5_add = tcp_v4_md5_add_func,
1725 .md5_parse = tcp_v4_parse_md5_keys,
1729 /* NOTE: A lot of things are set to zero explicitly by the call to
1730 * sk_alloc(), so they need not be done here.
1732 static int tcp_v4_init_sock(struct sock *sk)
1734 struct inet_connection_sock *icsk = inet_csk(sk);
1735 struct tcp_sock *tp = tcp_sk(sk);
1737 skb_queue_head_init(&tp->out_of_order_queue);
1738 tcp_init_xmit_timers(sk);
1739 tcp_prequeue_init(tp);
1741 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1742 tp->mdev = TCP_TIMEOUT_INIT;
1744 /* So many TCP implementations out there (incorrectly) count the
1745 * initial SYN frame in their delayed-ACK and congestion control
1746 * algorithms that we must have the following bandaid to talk
1747 * efficiently to them. -DaveM
1751 /* See draft-stevens-tcpca-spec-01 for discussion of the
1752 * initialization of these values.
1754 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1755 tp->snd_cwnd_clamp = ~0;
1756 tp->mss_cache = 536;
1758 tp->reordering = sysctl_tcp_reordering;
1759 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1761 sk->sk_state = TCP_CLOSE;
1763 sk->sk_write_space = sk_stream_write_space;
1764 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1766 icsk->icsk_af_ops = &ipv4_specific;
1767 icsk->icsk_sync_mss = tcp_sync_mss;
1768 #ifdef CONFIG_TCP_MD5SIG
1769 tp->af_specific = &tcp_sock_ipv4_specific;
1772 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1773 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1775 atomic_inc(&tcp_sockets_allocated);
1780 int tcp_v4_destroy_sock(struct sock *sk)
1782 struct tcp_sock *tp = tcp_sk(sk);
1784 tcp_clear_xmit_timers(sk);
1786 tcp_cleanup_congestion_control(sk);
1788 /* Clean up the write buffer. */
1789 tcp_write_queue_purge(sk);
1791 /* Cleans up our, hopefully empty, out_of_order_queue. */
1792 __skb_queue_purge(&tp->out_of_order_queue);
1794 #ifdef CONFIG_TCP_MD5SIG
1795 /* Clean up the MD5 key list, if any */
1796 if (tp->md5sig_info) {
1797 tcp_v4_clear_md5_list(sk);
1798 kfree(tp->md5sig_info);
1799 tp->md5sig_info = NULL;
1803 #ifdef CONFIG_NET_DMA
1804 /* Cleans up our sk_async_wait_queue */
1805 __skb_queue_purge(&sk->sk_async_wait_queue);
1808 /* Clean prequeue, it must be empty really */
1809 __skb_queue_purge(&tp->ucopy.prequeue);
1811 /* Clean up a referenced TCP bind bucket. */
1812 if (inet_csk(sk)->icsk_bind_hash)
1816 * If sendmsg cached page exists, toss it.
1818 if (sk->sk_sndmsg_page) {
1819 __free_page(sk->sk_sndmsg_page);
1820 sk->sk_sndmsg_page = NULL;
1823 if (tp->defer_tcp_accept.request) {
1824 reqsk_free(tp->defer_tcp_accept.request);
1825 sock_put(tp->defer_tcp_accept.listen_sk);
1827 tp->defer_tcp_accept.listen_sk = NULL;
1828 tp->defer_tcp_accept.request = NULL;
1831 atomic_dec(&tcp_sockets_allocated);
1836 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1838 #ifdef CONFIG_PROC_FS
1839 /* Proc filesystem TCP sock list dumping. */
1841 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1843 return hlist_empty(head) ? NULL :
1844 list_entry(head->first, struct inet_timewait_sock, tw_node);
1847 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1849 return tw->tw_node.next ?
1850 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1853 static void *listening_get_next(struct seq_file *seq, void *cur)
1855 struct inet_connection_sock *icsk;
1856 struct hlist_node *node;
1857 struct sock *sk = cur;
1858 struct tcp_iter_state* st = seq->private;
1859 struct net *net = seq_file_net(seq);
1863 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1869 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1870 struct request_sock *req = cur;
1872 icsk = inet_csk(st->syn_wait_sk);
1876 if (req->rsk_ops->family == st->family &&
1877 net_eq(sock_net(req->sk), net)) {
1883 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1886 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1888 sk = sk_next(st->syn_wait_sk);
1889 st->state = TCP_SEQ_STATE_LISTENING;
1890 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1892 icsk = inet_csk(sk);
1893 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1894 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1896 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1900 sk_for_each_from(sk, node) {
1901 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1905 icsk = inet_csk(sk);
1906 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1909 st->uid = sock_i_uid(sk);
1910 st->syn_wait_sk = sk;
1911 st->state = TCP_SEQ_STATE_OPENREQ;
1915 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1917 if (++st->bucket < INET_LHTABLE_SIZE) {
1918 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1926 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1928 void *rc = listening_get_next(seq, NULL);
1930 while (rc && *pos) {
1931 rc = listening_get_next(seq, rc);
1937 static void *established_get_first(struct seq_file *seq)
1939 struct tcp_iter_state* st = seq->private;
1940 struct net *net = seq_file_net(seq);
1943 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1945 struct hlist_node *node;
1946 struct inet_timewait_sock *tw;
1947 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1950 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1951 if (sk->sk_family != st->family ||
1952 !net_eq(sock_net(sk), net)) {
1958 st->state = TCP_SEQ_STATE_TIME_WAIT;
1959 inet_twsk_for_each(tw, node,
1960 &tcp_hashinfo.ehash[st->bucket].twchain) {
1961 if (tw->tw_family != st->family ||
1962 !net_eq(twsk_net(tw), net)) {
1968 read_unlock_bh(lock);
1969 st->state = TCP_SEQ_STATE_ESTABLISHED;
1975 static void *established_get_next(struct seq_file *seq, void *cur)
1977 struct sock *sk = cur;
1978 struct inet_timewait_sock *tw;
1979 struct hlist_node *node;
1980 struct tcp_iter_state* st = seq->private;
1981 struct net *net = seq_file_net(seq);
1985 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1989 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1996 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1997 st->state = TCP_SEQ_STATE_ESTABLISHED;
1999 if (++st->bucket < tcp_hashinfo.ehash_size) {
2000 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2001 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2009 sk_for_each_from(sk, node) {
2010 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2014 st->state = TCP_SEQ_STATE_TIME_WAIT;
2015 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2023 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2025 void *rc = established_get_first(seq);
2028 rc = established_get_next(seq, rc);
2034 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2037 struct tcp_iter_state* st = seq->private;
2039 inet_listen_lock(&tcp_hashinfo);
2040 st->state = TCP_SEQ_STATE_LISTENING;
2041 rc = listening_get_idx(seq, &pos);
2044 inet_listen_unlock(&tcp_hashinfo);
2045 st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 rc = established_get_idx(seq, pos);
2052 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2054 struct tcp_iter_state* st = seq->private;
2055 st->state = TCP_SEQ_STATE_LISTENING;
2057 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2060 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2063 struct tcp_iter_state* st;
2065 if (v == SEQ_START_TOKEN) {
2066 rc = tcp_get_idx(seq, 0);
2071 switch (st->state) {
2072 case TCP_SEQ_STATE_OPENREQ:
2073 case TCP_SEQ_STATE_LISTENING:
2074 rc = listening_get_next(seq, v);
2076 inet_listen_unlock(&tcp_hashinfo);
2077 st->state = TCP_SEQ_STATE_ESTABLISHED;
2078 rc = established_get_first(seq);
2081 case TCP_SEQ_STATE_ESTABLISHED:
2082 case TCP_SEQ_STATE_TIME_WAIT:
2083 rc = established_get_next(seq, v);
2091 static void tcp_seq_stop(struct seq_file *seq, void *v)
2093 struct tcp_iter_state* st = seq->private;
2095 switch (st->state) {
2096 case TCP_SEQ_STATE_OPENREQ:
2098 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2099 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2101 case TCP_SEQ_STATE_LISTENING:
2102 if (v != SEQ_START_TOKEN)
2103 inet_listen_unlock(&tcp_hashinfo);
2105 case TCP_SEQ_STATE_TIME_WAIT:
2106 case TCP_SEQ_STATE_ESTABLISHED:
2108 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2113 static int tcp_seq_open(struct inode *inode, struct file *file)
2115 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2116 struct tcp_iter_state *s;
2119 err = seq_open_net(inode, file, &afinfo->seq_ops,
2120 sizeof(struct tcp_iter_state));
2124 s = ((struct seq_file *)file->private_data)->private;
2125 s->family = afinfo->family;
2129 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2132 struct proc_dir_entry *p;
2134 afinfo->seq_fops.open = tcp_seq_open;
2135 afinfo->seq_fops.read = seq_read;
2136 afinfo->seq_fops.llseek = seq_lseek;
2137 afinfo->seq_fops.release = seq_release_net;
2139 afinfo->seq_ops.start = tcp_seq_start;
2140 afinfo->seq_ops.next = tcp_seq_next;
2141 afinfo->seq_ops.stop = tcp_seq_stop;
2143 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2144 &afinfo->seq_fops, afinfo);
2150 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2152 proc_net_remove(net, afinfo->name);
2155 static void get_openreq4(struct sock *sk, struct request_sock *req,
2156 struct seq_file *f, int i, int uid, int *len)
2158 const struct inet_request_sock *ireq = inet_rsk(req);
2159 int ttd = req->expires - jiffies;
2161 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2162 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2165 ntohs(inet_sk(sk)->sport),
2167 ntohs(ireq->rmt_port),
2169 0, 0, /* could print option size, but that is af dependent. */
2170 1, /* timers active (only the expire timer) */
2171 jiffies_to_clock_t(ttd),
2174 0, /* non standard timer */
2175 0, /* open_requests have no inode */
2176 atomic_read(&sk->sk_refcnt),
2181 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2184 unsigned long timer_expires;
2185 struct tcp_sock *tp = tcp_sk(sk);
2186 const struct inet_connection_sock *icsk = inet_csk(sk);
2187 struct inet_sock *inet = inet_sk(sk);
2188 __be32 dest = inet->daddr;
2189 __be32 src = inet->rcv_saddr;
2190 __u16 destp = ntohs(inet->dport);
2191 __u16 srcp = ntohs(inet->sport);
2193 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2195 timer_expires = icsk->icsk_timeout;
2196 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2198 timer_expires = icsk->icsk_timeout;
2199 } else if (timer_pending(&sk->sk_timer)) {
2201 timer_expires = sk->sk_timer.expires;
2204 timer_expires = jiffies;
2207 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2208 "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2209 i, src, srcp, dest, destp, sk->sk_state,
2210 tp->write_seq - tp->snd_una,
2211 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2212 (tp->rcv_nxt - tp->copied_seq),
2214 jiffies_to_clock_t(timer_expires - jiffies),
2215 icsk->icsk_retransmits,
2217 icsk->icsk_probes_out,
2219 atomic_read(&sk->sk_refcnt), sk,
2222 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2224 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2228 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2229 struct seq_file *f, int i, int *len)
2233 int ttd = tw->tw_ttd - jiffies;
2238 dest = tw->tw_daddr;
2239 src = tw->tw_rcv_saddr;
2240 destp = ntohs(tw->tw_dport);
2241 srcp = ntohs(tw->tw_sport);
2243 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2244 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2245 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2246 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2247 atomic_read(&tw->tw_refcnt), tw, len);
2252 static int tcp4_seq_show(struct seq_file *seq, void *v)
2254 struct tcp_iter_state* st;
2257 if (v == SEQ_START_TOKEN) {
2258 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2259 " sl local_address rem_address st tx_queue "
2260 "rx_queue tr tm->when retrnsmt uid timeout "
2266 switch (st->state) {
2267 case TCP_SEQ_STATE_LISTENING:
2268 case TCP_SEQ_STATE_ESTABLISHED:
2269 get_tcp4_sock(v, seq, st->num, &len);
2271 case TCP_SEQ_STATE_OPENREQ:
2272 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2274 case TCP_SEQ_STATE_TIME_WAIT:
2275 get_timewait4_sock(v, seq, st->num, &len);
2278 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2283 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2287 .owner = THIS_MODULE,
2290 .show = tcp4_seq_show,
2294 static int tcp4_proc_init_net(struct net *net)
2296 return tcp_proc_register(net, &tcp4_seq_afinfo);
2299 static void tcp4_proc_exit_net(struct net *net)
2301 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2304 static struct pernet_operations tcp4_net_ops = {
2305 .init = tcp4_proc_init_net,
2306 .exit = tcp4_proc_exit_net,
2309 int __init tcp4_proc_init(void)
2311 return register_pernet_subsys(&tcp4_net_ops);
2314 void tcp4_proc_exit(void)
2316 unregister_pernet_subsys(&tcp4_net_ops);
2318 #endif /* CONFIG_PROC_FS */
2320 struct proto tcp_prot = {
2322 .owner = THIS_MODULE,
2324 .connect = tcp_v4_connect,
2325 .disconnect = tcp_disconnect,
2326 .accept = inet_csk_accept,
2328 .init = tcp_v4_init_sock,
2329 .destroy = tcp_v4_destroy_sock,
2330 .shutdown = tcp_shutdown,
2331 .setsockopt = tcp_setsockopt,
2332 .getsockopt = tcp_getsockopt,
2333 .recvmsg = tcp_recvmsg,
2334 .backlog_rcv = tcp_v4_do_rcv,
2336 .unhash = inet_unhash,
2337 .get_port = inet_csk_get_port,
2338 .enter_memory_pressure = tcp_enter_memory_pressure,
2339 .sockets_allocated = &tcp_sockets_allocated,
2340 .orphan_count = &tcp_orphan_count,
2341 .memory_allocated = &tcp_memory_allocated,
2342 .memory_pressure = &tcp_memory_pressure,
2343 .sysctl_mem = sysctl_tcp_mem,
2344 .sysctl_wmem = sysctl_tcp_wmem,
2345 .sysctl_rmem = sysctl_tcp_rmem,
2346 .max_header = MAX_TCP_HEADER,
2347 .obj_size = sizeof(struct tcp_sock),
2348 .twsk_prot = &tcp_timewait_sock_ops,
2349 .rsk_prot = &tcp_request_sock_ops,
2350 .h.hashinfo = &tcp_hashinfo,
2351 #ifdef CONFIG_COMPAT
2352 .compat_setsockopt = compat_tcp_setsockopt,
2353 .compat_getsockopt = compat_tcp_getsockopt,
2358 static int __net_init tcp_sk_init(struct net *net)
2360 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2361 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2364 static void __net_exit tcp_sk_exit(struct net *net)
2366 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2369 static struct pernet_operations __net_initdata tcp_sk_ops = {
2370 .init = tcp_sk_init,
2371 .exit = tcp_sk_exit,
2374 void __init tcp_v4_init(void)
2376 if (register_pernet_device(&tcp_sk_ops))
2377 panic("Failed to create the TCP control socket.\n");
2380 EXPORT_SYMBOL(ipv4_specific);
2381 EXPORT_SYMBOL(tcp_hashinfo);
2382 EXPORT_SYMBOL(tcp_prot);
2383 EXPORT_SYMBOL(tcp_v4_conn_request);
2384 EXPORT_SYMBOL(tcp_v4_connect);
2385 EXPORT_SYMBOL(tcp_v4_do_rcv);
2386 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2387 EXPORT_SYMBOL(tcp_v4_send_check);
2388 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2390 #ifdef CONFIG_PROC_FS
2391 EXPORT_SYMBOL(tcp_proc_register);
2392 EXPORT_SYMBOL(tcp_proc_unregister);
2394 EXPORT_SYMBOL(sysctl_tcp_low_latency);