2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
69 #include <net/inet_common.h>
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
78 extern int sysctl_ip_dynaddr;
79 int sysctl_tcp_tw_reuse;
80 int sysctl_tcp_low_latency;
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
92 .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
93 .__tcp_lhash_users = ATOMIC_INIT(0),
95 = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96 .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
100 * This array holds the first and last local port number.
101 * For high-usage systems, use sysctl to change this to
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108 __u32 faddr, __u16 fport)
110 int h = (laddr ^ lport) ^ (faddr ^ fport);
113 return h & (tcp_ehash_size - 1);
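/* Illustrative aside (not part of the original file): a minimal user-space
 * style sketch of the hashing idea above, assuming the table size is a power
 * of two so the final mask works. The helper name and the bit-mixing steps
 * are inventions of this example, not a claim about the elided lines.
 */
static unsigned int example_ehash_index(unsigned int laddr, unsigned short lport,
					unsigned int faddr, unsigned short fport,
					unsigned int table_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;			/* mix high bits down ...            */
	h ^= h >> 8;			/* ... so the mask below sees them   */
	return h & (table_size - 1);	/* table_size must be a power of two */
}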
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
118 struct inet_sock *inet = inet_sk(sk);
119 __u32 laddr = inet->rcv_saddr;
120 __u16 lport = inet->num;
121 __u32 faddr = inet->daddr;
122 __u16 fport = inet->dport;
124 return tcp_hashfn(laddr, lport, faddr, fport);
127 /* Allocate and initialize a new TCP local port bind bucket.
128 * The bindhash mutex for snum's hash chain must be held here.
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
133 struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
138 INIT_HLIST_HEAD(&tb->owners);
139 hlist_add_head(&tb->node, &head->chain);
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
147 if (hlist_empty(&tb->owners)) {
148 __hlist_del(&tb->node);
149 kmem_cache_free(tcp_bucket_cachep, tb);
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
156 struct tcp_bind_hashbucket *head =
157 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158 struct tcp_bind_bucket *tb;
160 spin_lock(&head->lock);
161 tb = tcp_sk(sk)->bind_hash;
162 sk_add_bind_node(child, &tb->owners);
163 tcp_sk(child)->bind_hash = tb;
164 spin_unlock(&head->lock);
167 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
170 __tcp_inherit_port(sk, child);
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
177 inet_sk(sk)->num = snum;
178 sk_add_bind_node(sk, &tb->owners);
179 tcp_sk(sk)->bind_hash = tb;
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
186 struct hlist_node *node;
187 int reuse = sk->sk_reuse;
189 sk_for_each_bound(sk2, node, &tb->owners) {
191 !tcp_v6_ipv6only(sk2) &&
192 (!sk->sk_bound_dev_if ||
193 !sk2->sk_bound_dev_if ||
194 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195 if (!reuse || !sk2->sk_reuse ||
196 sk2->sk_state == TCP_LISTEN) {
197 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199 sk2_rcv_saddr == sk_rcv_saddr)
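/* Illustrative aside (not part of the original file): the bind-conflict rules
 * above restated as a stand-alone predicate, a sketch only. The name and the
 * flattened parameters are assumptions of the example; the real check also
 * skips IPv6-only peers and walks every owner of the bind bucket.
 */
static int example_bind_conflict(int both_set_reuseaddr, int other_is_listening,
				 unsigned int my_dev, unsigned int other_dev,
				 unsigned int my_addr, unsigned int other_addr)
{
	/* Sockets bound to different devices never conflict. */
	if (my_dev && other_dev && my_dev != other_dev)
		return 0;
	/* SO_REUSEADDR on both ends is enough, unless the other end listens. */
	if (both_set_reuseaddr && !other_is_listening)
		return 0;
	/* Otherwise a wildcard on either side, or an exact match, conflicts. */
	return !my_addr || !other_addr || my_addr == other_addr;
}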
207 /* Obtain a reference to a local port for the given sock;
208 * if snum is zero it means select any available local port.
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 struct tcp_bind_hashbucket *head;
213 struct hlist_node *node;
214 struct tcp_bind_bucket *tb;
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
224 spin_lock(&tcp_portalloc_lock);
225 if (tcp_port_rover < low)
228 rover = tcp_port_rover;
233 head = &tcp_bhash[tcp_bhashfn(rover)];
234 spin_lock(&head->lock);
235 tb_for_each(tb, node, &head->chain)
236 if (tb->port == rover)
240 spin_unlock(&head->lock);
241 } while (--remaining > 0);
242 tcp_port_rover = rover;
243 spin_unlock(&tcp_portalloc_lock);
245 /* Exhausted local port range during search? It is not
246 * possible for us to be holding one of the bind hash
247 * locks if this test triggers, because if 'remaining'
248 * drops to zero, we broke out of the do/while loop at
249 * the top level, not from the 'break;' statement.
252 if (unlikely(remaining <= 0))
255 /* OK, here is the one we will use. HEAD is
256 * non-NULL and we hold its mutex.
260 head = &tcp_bhash[tcp_bhashfn(snum)];
261 spin_lock(&head->lock);
262 tb_for_each(tb, node, &head->chain)
263 if (tb->port == snum)
269 if (!hlist_empty(&tb->owners)) {
270 if (sk->sk_reuse > 1)
272 if (tb->fastreuse > 0 &&
273 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
277 if (tcp_bind_conflict(sk, tb))
283 if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
285 if (hlist_empty(&tb->owners)) {
286 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
290 } else if (tb->fastreuse &&
291 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
294 if (!tcp_sk(sk)->bind_hash)
295 tcp_bind_hash(sk, tb, snum);
296 BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
300 spin_unlock(&head->lock);
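/* Illustrative aside (not part of the original file): a minimal sketch of the
 * rover-style ephemeral port search above, with the bind buckets, fastreuse
 * handling and locking stripped out. Names are invented for the example; the
 * real code keeps its rover in tcp_port_rover under tcp_portalloc_lock.
 */
static int example_pick_local_port(int low, int high, int *rover,
				   int (*port_in_use)(int port))
{
	int remaining = (high - low) + 1;

	do {
		if (++(*rover) > high || *rover < low)
			*rover = low;		/* wrap around the range     */
		if (!port_in_use(*rover))
			return *rover;		/* free port found           */
	} while (--remaining > 0);

	return -1;				/* exhausted, -EADDRNOTAVAIL */
}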
306 /* Get rid of any references to a local port held by the
309 static void __tcp_put_port(struct sock *sk)
311 struct inet_sock *inet = inet_sk(sk);
312 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
313 struct tcp_bind_bucket *tb;
315 spin_lock(&head->lock);
316 tb = tcp_sk(sk)->bind_hash;
317 __sk_del_bind_node(sk);
318 tcp_sk(sk)->bind_hash = NULL;
320 tcp_bucket_destroy(tb);
321 spin_unlock(&head->lock);
324 void tcp_put_port(struct sock *sk)
331 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
332 * Look, when several writers sleep and the reader wakes them up, all but one
333 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
334 * this, _but_ remember, it adds useless work on UP machines (a wake-up on each
335 * exclusive lock release). It should really be ifdefed.
338 void tcp_listen_wlock(void)
340 write_lock(&tcp_lhash_lock);
342 if (atomic_read(&tcp_lhash_users)) {
346 prepare_to_wait_exclusive(&tcp_lhash_wait,
347 &wait, TASK_UNINTERRUPTIBLE);
348 if (!atomic_read(&tcp_lhash_users))
350 write_unlock_bh(&tcp_lhash_lock);
352 write_lock_bh(&tcp_lhash_lock);
355 finish_wait(&tcp_lhash_wait, &wait);
359 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
361 struct hlist_head *list;
364 BUG_TRAP(sk_unhashed(sk));
365 if (listen_possible && sk->sk_state == TCP_LISTEN) {
366 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
367 lock = &tcp_lhash_lock;
370 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
371 lock = &tcp_ehash[sk->sk_hashent].lock;
374 __sk_add_node(sk, list);
375 sock_prot_inc_use(sk->sk_prot);
377 if (listen_possible && sk->sk_state == TCP_LISTEN)
378 wake_up(&tcp_lhash_wait);
381 static void tcp_v4_hash(struct sock *sk)
383 if (sk->sk_state != TCP_CLOSE) {
385 __tcp_v4_hash(sk, 1);
390 void tcp_unhash(struct sock *sk)
397 if (sk->sk_state == TCP_LISTEN) {
400 lock = &tcp_lhash_lock;
402 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
404 write_lock_bh(&head->lock);
407 if (__sk_del_node_init(sk))
408 sock_prot_dec_use(sk->sk_prot);
409 write_unlock_bh(lock);
412 if (sk->sk_state == TCP_LISTEN)
413 wake_up(&tcp_lhash_wait);
416 /* Don't inline this cruft. There are some nice properties to
417 * exploit here. The BSD API does not allow a listening TCP
418 * to specify the remote port nor the remote address for the
419 * connection. So always assume those are both wildcarded
420 * during the search since they can never be otherwise.
422 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
423 unsigned short hnum, int dif)
425 struct sock *result = NULL, *sk;
426 struct hlist_node *node;
430 sk_for_each(sk, node, head) {
431 struct inet_sock *inet = inet_sk(sk);
433 if (inet->num == hnum && !ipv6_only_sock(sk)) {
434 __u32 rcv_saddr = inet->rcv_saddr;
436 score = (sk->sk_family == PF_INET ? 1 : 0);
438 if (rcv_saddr != daddr)
442 if (sk->sk_bound_dev_if) {
443 if (sk->sk_bound_dev_if != dif)
449 if (score > hiscore) {
458 /* Optimize the common listener case. */
459 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
460 unsigned short hnum, int dif)
462 struct sock *sk = NULL;
463 struct hlist_head *head;
465 read_lock(&tcp_lhash_lock);
466 head = &tcp_listening_hash[tcp_lhashfn(hnum)];
467 if (!hlist_empty(head)) {
468 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
470 if (inet->num == hnum && !sk->sk_node.next &&
471 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
472 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
473 !sk->sk_bound_dev_if)
475 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
481 read_unlock(&tcp_lhash_lock);
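/* Illustrative aside (not part of the original file): the listener scoring
 * walked through above, boiled down to a helper. The exact weights are shown
 * only for illustration; the point is that a listener bound to the precise
 * local address or device beats a wildcard one.
 */
static int example_listener_score(int pure_ipv4_socket, int bound_addr_matches,
				  int bound_dev_matches)
{
	int score = pure_ipv4_socket ? 1 : 0;	/* prefer plain IPv4 sockets   */

	if (bound_addr_matches)
		score += 2;			/* bound to this local address */
	if (bound_dev_matches)
		score += 2;			/* bound to the ingress device */
	return score;				/* the highest score wins      */
}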
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
488 * Local BH must be disabled here.
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
495 struct tcp_ehash_bucket *head;
496 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
497 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
499 struct hlist_node *node;
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyway.
503 int hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 sk_for_each(sk, node, &head->chain) {
507 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
513 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
518 read_unlock(&head->lock);
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
528 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
531 return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
534 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
540 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
546 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
548 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
550 return secure_tcp_sequence_number(skb->nh.iph->daddr,
556 /* called with local bh disabled */
557 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
558 struct tcp_tw_bucket **twp)
560 struct inet_sock *inet = inet_sk(sk);
561 u32 daddr = inet->rcv_saddr;
562 u32 saddr = inet->daddr;
563 int dif = sk->sk_bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
569 struct hlist_node *node;
570 struct tcp_tw_bucket *tw;
572 write_lock(&head->lock);
574 /* Check TIME-WAIT sockets first. */
575 sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
576 tw = (struct tcp_tw_bucket *)sk2;
578 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 struct tcp_sock *tp = tcp_sk(sk);
581 /* With PAWS, it is safe from the viewpoint
582 of data integrity. Even without PAWS it
583 is safe provided sequence spaces do not
584 overlap i.e. at data rates <= 80Mbit/sec.
586 Actually, the idea is close to VJ's one,
587 only timestamp cache is held not per host,
588 but per port pair and TW bucket is used
591 If TW bucket has been already destroyed we
592 fall back to VJ's scheme and use initial
593 timestamp retrieved from peer table.
595 if (tw->tw_ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse &&
598 tw->tw_ts_recent_stamp > 1))) {
600 tw->tw_snd_nxt + 65535 + 2) == 0)
602 tp->rx_opt.ts_recent = tw->tw_ts_recent;
603 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
612 /* And established part... */
613 sk_for_each(sk2, node, &head->chain) {
614 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
619 /* Must record num and sport now. Otherwise we will see
620 * a socket with a funny identity in the hash table. */
622 inet->sport = htons(lport);
623 sk->sk_hashent = hash;
624 BUG_TRAP(sk_unhashed(sk));
625 __sk_add_node(sk, &head->chain);
626 sock_prot_inc_use(sk->sk_prot);
627 write_unlock(&head->lock);
631 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
633 /* Silly. Should hash-dance instead... */
634 tcp_tw_deschedule(tw);
635 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
643 write_unlock(&head->lock);
644 return -EADDRNOTAVAIL;
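/* Illustrative aside (not part of the original file): the TIME-WAIT takeover
 * test above as a sketch. A TIME-WAIT socket may be reused for a new outgoing
 * connection only if it recorded peer timestamps (so PAWS can sort old
 * duplicates from new data) and the tcp_tw_reuse sysctl permits it; the age
 * test below is a simplification, and the names are invented.
 */
static int example_may_reuse_timewait(int tw_saw_timestamps,
				      int tw_reuse_sysctl_enabled,
				      long tw_age_seconds)
{
	if (!tw_saw_timestamps)
		return 0;	/* without timestamps PAWS cannot protect us */
	return tw_reuse_sysctl_enabled && tw_age_seconds > 1;
}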
647 static inline u32 connect_port_offset(const struct sock *sk)
649 const struct inet_sock *inet = inet_sk(sk);
651 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
656 * Bind a port for a connect operation and hash it.
658 static inline int tcp_v4_hash_connect(struct sock *sk)
660 unsigned short snum = inet_sk(sk)->num;
661 struct tcp_bind_hashbucket *head;
662 struct tcp_bind_bucket *tb;
666 int low = sysctl_local_port_range[0];
667 int high = sysctl_local_port_range[1];
668 int range = high - low;
672 u32 offset = hint + connect_port_offset(sk);
673 struct hlist_node *node;
674 struct tcp_tw_bucket *tw = NULL;
677 for (i = 1; i <= range; i++) {
678 port = low + (i + offset) % range;
679 head = &tcp_bhash[tcp_bhashfn(port)];
680 spin_lock(&head->lock);
682 /* Does not bother with rcv_saddr checks,
683 * because the established check is already
686 tb_for_each(tb, node, &head->chain) {
687 if (tb->port == port) {
688 BUG_TRAP(!hlist_empty(&tb->owners));
689 if (tb->fastreuse >= 0)
691 if (!__tcp_v4_check_established(sk,
699 tb = tcp_bucket_create(head, port);
701 spin_unlock(&head->lock);
708 spin_unlock(&head->lock);
712 return -EADDRNOTAVAIL;
717 /* Head lock still held and bh's disabled */
718 tcp_bind_hash(sk, tb, port);
719 if (sk_unhashed(sk)) {
720 inet_sk(sk)->sport = htons(port);
721 __tcp_v4_hash(sk, 0);
723 spin_unlock(&head->lock);
726 tcp_tw_deschedule(tw);
734 head = &tcp_bhash[tcp_bhashfn(snum)];
735 tb = tcp_sk(sk)->bind_hash;
736 spin_lock_bh(&head->lock);
737 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
738 __tcp_v4_hash(sk, 0);
739 spin_unlock_bh(&head->lock);
742 spin_unlock(&head->lock);
743 /* No definite answer... Walk to established hash table */
744 ret = __tcp_v4_check_established(sk, snum, NULL);
751 /* This will initiate an outgoing connection. */
752 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
754 struct inet_sock *inet = inet_sk(sk);
755 struct tcp_sock *tp = tcp_sk(sk);
756 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
762 if (addr_len < sizeof(struct sockaddr_in))
765 if (usin->sin_family != AF_INET)
766 return -EAFNOSUPPORT;
768 nexthop = daddr = usin->sin_addr.s_addr;
769 if (inet->opt && inet->opt->srr) {
772 nexthop = inet->opt->faddr;
775 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
776 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
778 inet->sport, usin->sin_port, sk);
782 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
787 if (!inet->opt || !inet->opt->srr)
791 inet->saddr = rt->rt_src;
792 inet->rcv_saddr = inet->saddr;
794 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
795 /* Reset inherited state */
796 tp->rx_opt.ts_recent = 0;
797 tp->rx_opt.ts_recent_stamp = 0;
801 if (sysctl_tcp_tw_recycle &&
802 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
803 struct inet_peer *peer = rt_get_peer(rt);
805 /* VJ's idea. We save the last timestamp seen from
806 * the destination in the peer table when entering state TIME-WAIT,
807 * and initialize rx_opt.ts_recent from it when trying a new connection.
810 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
811 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
812 tp->rx_opt.ts_recent = peer->tcp_ts;
816 inet->dport = usin->sin_port;
819 tp->ext_header_len = 0;
821 tp->ext_header_len = inet->opt->optlen;
823 tp->rx_opt.mss_clamp = 536;
825 /* Socket identity is still unknown (sport may be zero).
826 * However we set the state to SYN-SENT and, without releasing the socket
827 * lock, select a source port, enter ourselves into the hash tables and
828 * complete initialization after this.
830 tcp_set_state(sk, TCP_SYN_SENT);
831 err = tcp_v4_hash_connect(sk);
835 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
839 /* OK, now commit destination to socket. */
840 __sk_dst_set(sk, &rt->u.dst);
841 tcp_v4_setup_caps(sk, &rt->u.dst);
844 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
849 inet->id = tp->write_seq ^ jiffies;
851 err = tcp_connect(sk);
859 /* This unhashes the socket and releases the local port, if necessary. */
860 tcp_set_state(sk, TCP_CLOSE);
862 sk->sk_route_caps = 0;
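/* Illustrative aside (not part of the original file): from user space this
 * whole path is driven by an ordinary connect(2) on a TCP socket. The block
 * below is a user-space sketch with its own includes and an example address,
 * kept under #if 0 so it is never built as part of this file.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* ends up in tcp_v4_init_sock() */

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* documentation address */

	/* Drives tcp_v4_connect(): route lookup, port hashing, SYN sent. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
#endif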
867 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
869 return ((struct rtable *)skb->dst)->rt_iif;
872 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
874 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
877 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
878 struct request_sock ***prevp,
880 __u32 raddr, __u32 laddr)
882 struct listen_sock *lopt = tp->accept_queue.listen_opt;
883 struct request_sock *req, **prev;
885 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
886 (req = *prev) != NULL;
887 prev = &req->dl_next) {
888 const struct inet_request_sock *ireq = inet_rsk(req);
890 if (ireq->rmt_port == rport &&
891 ireq->rmt_addr == raddr &&
892 ireq->loc_addr == laddr &&
893 TCP_INET_FAMILY(req->rsk_ops->family)) {
903 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
905 struct tcp_sock *tp = tcp_sk(sk);
906 struct listen_sock *lopt = tp->accept_queue.listen_opt;
907 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
909 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
915 * This routine does path mtu discovery as defined in RFC1191.
917 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
920 struct dst_entry *dst;
921 struct inet_sock *inet = inet_sk(sk);
922 struct tcp_sock *tp = tcp_sk(sk);
924 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
925 * sent out by Linux are always < 576 bytes so they should go through
928 if (sk->sk_state == TCP_LISTEN)
931 /* We don't check in the dst entry if pmtu discovery is forbidden
932 * on this route. We just assume that no packet-too-big packets
933 * are sent back when pmtu discovery is not active.
934 * There is a small race when the user changes this flag in the
935 * route, but I think that's acceptable.
937 if ((dst = __sk_dst_check(sk, 0)) == NULL)
940 dst->ops->update_pmtu(dst, mtu);
942 /* Something is about to go wrong... Remember the soft error
943 * in case this connection is not able to recover.
945 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
946 sk->sk_err_soft = EMSGSIZE;
950 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
951 tp->pmtu_cookie > mtu) {
952 tcp_sync_mss(sk, mtu);
954 /* Resend the TCP packet because it's
955 * clear that the old packet has been
956 * dropped. This is the new "fast" path mtu
959 tcp_simple_retransmit(sk);
960 } /* else let the usual retransmit timer handle it */
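/* Illustrative aside (not part of the original file): the MSS that
 * tcp_sync_mss() works toward is, roughly, the path MTU minus the fixed IPv4
 * and TCP headers. A minimal sketch of that arithmetic, ignoring IP and TCP
 * options; the helper name is an invention of the example.
 */
static int example_mss_from_mtu(int path_mtu)
{
	const int ip_header = 20;	/* struct iphdr without options  */
	const int tcp_header = 20;	/* struct tcphdr without options */

	return path_mtu - ip_header - tcp_header;	/* e.g. 1500 -> 1460 */
}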
964 * This routine is called by the ICMP module when it gets some
965 * sort of error condition. If err < 0 then the socket should
966 * be closed and the error returned to the user. If err > 0
967 * it's just the icmp type << 8 | icmp code. After adjustment
968 * header points to the first 8 bytes of the tcp header. We need
969 * to find the appropriate port.
971 * The locking strategy used here is very "optimistic". When
972 * someone else accesses the socket the ICMP is just dropped
973 * and for some paths there is no check at all.
974 * A more general error queue to queue errors for later handling
975 * is probably better.
979 void tcp_v4_err(struct sk_buff *skb, u32 info)
981 struct iphdr *iph = (struct iphdr *)skb->data;
982 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
984 struct inet_sock *inet;
985 int type = skb->h.icmph->type;
986 int code = skb->h.icmph->code;
991 if (skb->len < (iph->ihl << 2) + 8) {
992 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
996 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
997 th->source, tcp_v4_iif(skb));
999 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
1002 if (sk->sk_state == TCP_TIME_WAIT) {
1003 tcp_tw_put((struct tcp_tw_bucket *)sk);
1008 /* If too many ICMPs get dropped on busy
1009 * servers this needs to be solved differently.
1011 if (sock_owned_by_user(sk))
1012 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1014 if (sk->sk_state == TCP_CLOSE)
1018 seq = ntohl(th->seq);
1019 if (sk->sk_state != TCP_LISTEN &&
1020 !between(seq, tp->snd_una, tp->snd_nxt)) {
1021 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1026 case ICMP_SOURCE_QUENCH:
1027 /* Just silently ignore these. */
1029 case ICMP_PARAMETERPROB:
1032 case ICMP_DEST_UNREACH:
1033 if (code > NR_ICMP_UNREACH)
1036 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1037 if (!sock_owned_by_user(sk))
1038 do_pmtu_discovery(sk, iph, info);
1042 err = icmp_err_convert[code].errno;
1044 case ICMP_TIME_EXCEEDED:
1051 switch (sk->sk_state) {
1052 struct request_sock *req, **prev;
1054 if (sock_owned_by_user(sk))
1057 req = tcp_v4_search_req(tp, &prev, th->dest,
1058 iph->daddr, iph->saddr);
1062 /* ICMPs are not backlogged, hence we cannot get
1063 an established socket here.
1067 if (seq != tcp_rsk(req)->snt_isn) {
1068 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1073 * Still in SYN_RECV, just remove it silently.
1074 * There is no good way to pass the error to the newly
1075 * created socket, and POSIX does not want network
1076 * errors returned from accept().
1078 tcp_synq_drop(sk, req, prev);
1082 case TCP_SYN_RECV: /* Cannot happen.
1083 It can, e.g., if SYNs crossed.
1085 if (!sock_owned_by_user(sk)) {
1086 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1089 sk->sk_error_report(sk);
1093 sk->sk_err_soft = err;
1098 /* If we've already connected we will keep trying
1099 * until we time out, or the user gives up.
1101 * rfc1122 4.2.3.9 allows us to consider as hard errors
1102 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1103 * but it is obsoleted by pmtu discovery).
1105 * Note that in the modern internet, where routing is unreliable
1106 * and in each dark corner broken firewalls sit, sending random
1107 * errors ordered by their masters, even these two messages finally lose
1108 * their original sense (even Linux sends invalid PORT_UNREACHs)
1110 * Now we are in compliance with RFCs.
1115 if (!sock_owned_by_user(sk) && inet->recverr) {
1117 sk->sk_error_report(sk);
1118 } else { /* Only an error on timeout */
1119 sk->sk_err_soft = err;
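/* Illustrative aside (not part of the original file): the "icmp type << 8 |
 * icmp code" packing mentioned in the comment above tcp_v4_err(), shown in
 * both directions. Helper names are invented for the example.
 */
static int example_pack_icmp_err(int icmp_type, int icmp_code)
{
	return (icmp_type << 8) | icmp_code;
}

static void example_unpack_icmp_err(int packed, int *icmp_type, int *icmp_code)
{
	*icmp_type = packed >> 8;	/* e.g. ICMP_DEST_UNREACH */
	*icmp_code = packed & 0xff;	/* e.g. ICMP_FRAG_NEEDED  */
}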
1127 /* This routine computes an IPv4 TCP checksum. */
1128 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1129 struct sk_buff *skb)
1131 struct inet_sock *inet = inet_sk(sk);
1133 if (skb->ip_summed == CHECKSUM_HW) {
1134 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1135 skb->csum = offsetof(struct tcphdr, check);
1137 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1138 csum_partial((char *)th,
1145 * This routine will send an RST to the other tcp.
1147 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
1149 * Answer: if a packet caused the RST, it is not for a socket
1150 * existing in our system; if it is matched to a socket,
1151 * it is just a duplicate segment or a bug in the other side's TCP.
1152 * So we build the reply based only on the parameters
1153 * that arrived with the segment.
1154 * Exception: precedence violation. We do not implement it in any case.
1157 static void tcp_v4_send_reset(struct sk_buff *skb)
1159 struct tcphdr *th = skb->h.th;
1161 struct ip_reply_arg arg;
1163 /* Never send a reset in response to a reset. */
1167 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1170 /* Swap the send and the receive. */
1171 memset(&rth, 0, sizeof(struct tcphdr));
1172 rth.dest = th->source;
1173 rth.source = th->dest;
1174 rth.doff = sizeof(struct tcphdr) / 4;
1178 rth.seq = th->ack_seq;
1181 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1182 skb->len - (th->doff << 2));
1185 memset(&arg, 0, sizeof arg);
1186 arg.iov[0].iov_base = (unsigned char *)&rth;
1187 arg.iov[0].iov_len = sizeof rth;
1188 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1189 skb->nh.iph->saddr, /*XXX*/
1190 sizeof(struct tcphdr), IPPROTO_TCP, 0);
1191 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1193 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1195 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1196 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
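/* Illustrative aside (not part of the original file): the RFC 793 rule the
 * code above follows, restated on plain integers as a sketch. If the
 * offending segment carried an ACK, the RST reuses that ACK as its sequence
 * number; otherwise the RST acknowledges everything the segment occupied
 * (payload plus one each for SYN and FIN). Names are invented.
 */
static void example_rst_numbers(int segment_had_ack, unsigned int seg_seq,
				unsigned int seg_ack, unsigned int payload_len,
				int syn, int fin,
				unsigned int *rst_seq, unsigned int *rst_ack)
{
	if (segment_had_ack) {
		*rst_seq = seg_ack;	/* RST.SEQ = SEG.ACK, no ACK flag */
		*rst_ack = 0;
	} else {
		*rst_seq = 0;		/* RST.SEQ = 0, ACK flag set      */
		*rst_ack = seg_seq + syn + fin + payload_len;
	}
}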
1199 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1200 outside of socket context, is ugly, certainly. What can I do?
1203 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1206 struct tcphdr *th = skb->h.th;
1211 struct ip_reply_arg arg;
1213 memset(&rep.th, 0, sizeof(struct tcphdr));
1214 memset(&arg, 0, sizeof arg);
1216 arg.iov[0].iov_base = (unsigned char *)&rep;
1217 arg.iov[0].iov_len = sizeof(rep.th);
1219 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1220 (TCPOPT_TIMESTAMP << 8) |
1222 rep.tsopt[1] = htonl(tcp_time_stamp);
1223 rep.tsopt[2] = htonl(ts);
1224 arg.iov[0].iov_len = sizeof(rep);
1227 /* Swap the send and the receive. */
1228 rep.th.dest = th->source;
1229 rep.th.source = th->dest;
1230 rep.th.doff = arg.iov[0].iov_len / 4;
1231 rep.th.seq = htonl(seq);
1232 rep.th.ack_seq = htonl(ack);
1234 rep.th.window = htons(win);
1236 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1237 skb->nh.iph->saddr, /*XXX*/
1238 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1239 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1241 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1243 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1246 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1248 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1250 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1251 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1256 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1258 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1262 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1263 struct request_sock *req)
1266 const struct inet_request_sock *ireq = inet_rsk(req);
1267 struct ip_options *opt = inet_rsk(req)->opt;
1268 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1270 { .daddr = ((opt && opt->srr) ?
1273 .saddr = ireq->loc_addr,
1274 .tos = RT_CONN_FLAGS(sk) } },
1275 .proto = IPPROTO_TCP,
1277 { .sport = inet_sk(sk)->sport,
1278 .dport = ireq->rmt_port } } };
1280 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1281 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1284 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1286 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1293 * Send a SYN-ACK after having received an ACK.
1294 * This still operates on a request_sock only, not on a big
1297 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1298 struct dst_entry *dst)
1300 const struct inet_request_sock *ireq = inet_rsk(req);
1302 struct sk_buff * skb;
1304 /* First, grab a route. */
1305 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1308 skb = tcp_make_synack(sk, dst, req);
1311 struct tcphdr *th = skb->h.th;
1313 th->check = tcp_v4_check(th, skb->len,
1316 csum_partial((char *)th, skb->len,
1319 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1322 if (err == NET_XMIT_CN)
1332 * IPv4 request_sock destructor.
1334 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1336 if (inet_rsk(req)->opt)
1337 kfree(inet_rsk(req)->opt);
1340 static inline void syn_flood_warning(struct sk_buff *skb)
1342 static unsigned long warntime;
1344 if (time_after(jiffies, (warntime + HZ * 60))) {
1347 "possible SYN flooding on port %d. Sending cookies.\n",
1348 ntohs(skb->h.th->dest));
1353 * Save and compile IPv4 options into the request_sock if needed.
1355 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1356 struct sk_buff *skb)
1358 struct ip_options *opt = &(IPCB(skb)->opt);
1359 struct ip_options *dopt = NULL;
1361 if (opt && opt->optlen) {
1362 int opt_size = optlength(opt);
1363 dopt = kmalloc(opt_size, GFP_ATOMIC);
1365 if (ip_options_echo(dopt, skb)) {
1374 struct request_sock_ops tcp_request_sock_ops = {
1376 .obj_size = sizeof(struct tcp_request_sock),
1377 .rtx_syn_ack = tcp_v4_send_synack,
1378 .send_ack = tcp_v4_reqsk_send_ack,
1379 .destructor = tcp_v4_reqsk_destructor,
1380 .send_reset = tcp_v4_send_reset,
1383 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1385 struct inet_request_sock *ireq;
1386 struct tcp_options_received tmp_opt;
1387 struct request_sock *req;
1388 __u32 saddr = skb->nh.iph->saddr;
1389 __u32 daddr = skb->nh.iph->daddr;
1390 __u32 isn = TCP_SKB_CB(skb)->when;
1391 struct dst_entry *dst = NULL;
1392 #ifdef CONFIG_SYN_COOKIES
1393 int want_cookie = 0;
1395 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1398 /* Never answer SYNs sent to broadcast or multicast
1399 if (((struct rtable *)skb->dst)->rt_flags &
1400 (RTCF_BROADCAST | RTCF_MULTICAST))
1403 /* TW buckets are converted to open requests without
1404 * limitations; they conserve resources and the peer is
1405 * evidently a real one.
1407 if (tcp_synq_is_full(sk) && !isn) {
1408 #ifdef CONFIG_SYN_COOKIES
1409 if (sysctl_tcp_syncookies) {
1416 /* Accept backlog is full. If we have already queued enough
1417 * warm entries in the syn queue, drop the request. It is better than
1418 * clogging syn queue with openreqs with exponentially increasing
1421 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1424 req = reqsk_alloc(&tcp_request_sock_ops);
1428 tcp_clear_options(&tmp_opt);
1429 tmp_opt.mss_clamp = 536;
1430 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1432 tcp_parse_options(skb, &tmp_opt, 0);
1435 tcp_clear_options(&tmp_opt);
1436 tmp_opt.saw_tstamp = 0;
1439 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1440 /* Some OSes (unknown ones, but I see them on a web server which
1441 * contains information interesting only for Windows
1442 * users) do not send their stamp in the SYN. It is an easy case:
1443 * we simply do not advertise TS support.
1445 tmp_opt.saw_tstamp = 0;
1446 tmp_opt.tstamp_ok = 0;
1448 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1450 tcp_openreq_init(req, &tmp_opt, skb);
1452 ireq = inet_rsk(req);
1453 ireq->loc_addr = daddr;
1454 ireq->rmt_addr = saddr;
1455 ireq->opt = tcp_v4_save_options(sk, skb);
1457 TCP_ECN_create_request(req, skb->h.th);
1460 #ifdef CONFIG_SYN_COOKIES
1461 syn_flood_warning(skb);
1463 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1465 struct inet_peer *peer = NULL;
1467 /* VJ's idea. We save the last timestamp seen
1468 * from the destination in the peer table when entering
1469 * state TIME-WAIT, and check against it before
1470 * accepting a new connection request.
1472 * If "isn" is not zero, this request hit an alive
1473 * timewait bucket, so all the necessary checks
1474 * are made in the function processing the timewait state.
1476 if (tmp_opt.saw_tstamp &&
1477 sysctl_tcp_tw_recycle &&
1478 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1479 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1480 peer->v4daddr == saddr) {
1481 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1482 (s32)(peer->tcp_ts - req->ts_recent) >
1484 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1489 /* Kill the following clause, if you dislike this way. */
1490 else if (!sysctl_tcp_syncookies &&
1491 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1492 (sysctl_max_syn_backlog >> 2)) &&
1493 (!peer || !peer->tcp_ts_stamp) &&
1494 (!dst || !dst_metric(dst, RTAX_RTT))) {
1495 /* Without syncookies the last quarter of
1496 * the backlog is filled with destinations
1497 * proven to be alive.
1498 * It means that we continue to communicate
1499 * with destinations already remembered
1500 * at the moment of the synflood.
1502 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1503 "request from %u.%u."
1506 ntohs(skb->h.th->source)));
1511 isn = tcp_v4_init_sequence(sk, skb);
1513 tcp_rsk(req)->snt_isn = isn;
1515 if (tcp_v4_send_synack(sk, req, dst))
1521 tcp_v4_synq_add(sk, req);
1528 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1534 * The three way handshake has completed - we got a valid ACK -
1535 * now create the new socket.
1537 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1538 struct request_sock *req,
1539 struct dst_entry *dst)
1541 struct inet_request_sock *ireq;
1542 struct inet_sock *newinet;
1543 struct tcp_sock *newtp;
1546 if (sk_acceptq_is_full(sk))
1549 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1552 newsk = tcp_create_openreq_child(sk, req, skb);
1556 newsk->sk_dst_cache = dst;
1557 tcp_v4_setup_caps(newsk, dst);
1559 newtp = tcp_sk(newsk);
1560 newinet = inet_sk(newsk);
1561 ireq = inet_rsk(req);
1562 newinet->daddr = ireq->rmt_addr;
1563 newinet->rcv_saddr = ireq->loc_addr;
1564 newinet->saddr = ireq->loc_addr;
1565 newinet->opt = ireq->opt;
1567 newinet->mc_index = tcp_v4_iif(skb);
1568 newinet->mc_ttl = skb->nh.iph->ttl;
1569 newtp->ext_header_len = 0;
1571 newtp->ext_header_len = newinet->opt->optlen;
1572 newinet->id = newtp->write_seq ^ jiffies;
1574 tcp_sync_mss(newsk, dst_mtu(dst));
1575 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1576 tcp_initialize_rcv_mss(newsk);
1578 __tcp_v4_hash(newsk, 0);
1579 __tcp_inherit_port(sk, newsk);
1584 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1586 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1591 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1593 struct tcphdr *th = skb->h.th;
1594 struct iphdr *iph = skb->nh.iph;
1595 struct tcp_sock *tp = tcp_sk(sk);
1597 struct request_sock **prev;
1598 /* Find possible connection requests. */
1599 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1600 iph->saddr, iph->daddr);
1602 return tcp_check_req(sk, skb, req, prev);
1604 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1611 if (nsk->sk_state != TCP_TIME_WAIT) {
1615 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1619 #ifdef CONFIG_SYN_COOKIES
1620 if (!th->rst && !th->syn && th->ack)
1621 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1626 static int tcp_v4_checksum_init(struct sk_buff *skb)
1628 if (skb->ip_summed == CHECKSUM_HW) {
1629 skb->ip_summed = CHECKSUM_UNNECESSARY;
1630 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1631 skb->nh.iph->daddr, skb->csum))
1634 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1635 skb->ip_summed = CHECKSUM_NONE;
1637 if (skb->len <= 76) {
1638 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1640 skb_checksum(skb, 0, skb->len, 0)))
1642 skb->ip_summed = CHECKSUM_UNNECESSARY;
1644 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1646 skb->nh.iph->daddr, 0);
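/* Illustrative aside (not part of the original file): tcp_v4_check() above is
 * the standard Internet checksum computed over the TCP segment plus an IPv4
 * pseudo-header. A minimal sketch of the 16-bit one's complement sum
 * (RFC 1071); the pseudo-header and the incremental csum_partial() pipeline
 * of the real code are deliberately left out.
 */
static unsigned short example_inet_checksum(const unsigned char *data, int len)
{
	unsigned long sum = 0;

	while (len > 1) {			/* sum 16-bit words         */
		sum += (data[0] << 8) | data[1];
		data += 2;
		len -= 2;
	}
	if (len)				/* odd trailing byte        */
		sum += data[0] << 8;
	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum;		/* one's complement         */
}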
1652 /* The socket must have its spinlock held when we get
1655 * We have a potential double-lock case here, so even when
1656 * doing backlog processing we use the BH locking scheme.
1657 * This is because we cannot sleep with the original spinlock
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1662 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1663 TCP_CHECK_TIMER(sk);
1664 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1666 TCP_CHECK_TIMER(sk);
1670 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1673 if (sk->sk_state == TCP_LISTEN) {
1674 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1679 if (tcp_child_process(sk, nsk, skb))
1685 TCP_CHECK_TIMER(sk);
1686 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1688 TCP_CHECK_TIMER(sk);
1692 tcp_v4_send_reset(skb);
1695 /* Be careful here. If this function gets more complicated and
1696 * gcc suffers from register pressure on the x86, sk (in %ebx)
1697 * might be destroyed here. This current version compiles correctly,
1698 * but you have been warned.
1703 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1711 int tcp_v4_rcv(struct sk_buff *skb)
1717 if (skb->pkt_type != PACKET_HOST)
1720 /* Count it even if it's bad */
1721 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1723 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1728 if (th->doff < sizeof(struct tcphdr) / 4)
1730 if (!pskb_may_pull(skb, th->doff * 4))
1733 /* An explanation is required here, I think.
1734 * Packet length and doff are validated by header prediction,
1735 * provided the case of th->doff == 0 is eliminated.
1736 * So, we defer the checks. */
1737 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1738 tcp_v4_checksum_init(skb) < 0))
1742 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1743 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1744 skb->len - th->doff * 4);
1745 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1746 TCP_SKB_CB(skb)->when = 0;
1747 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1748 TCP_SKB_CB(skb)->sacked = 0;
1750 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1751 skb->nh.iph->daddr, ntohs(th->dest),
1758 if (sk->sk_state == TCP_TIME_WAIT)
1761 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1762 goto discard_and_relse;
1764 if (sk_filter(sk, skb, 0))
1765 goto discard_and_relse;
1771 if (!sock_owned_by_user(sk)) {
1772 if (!tcp_prequeue(sk, skb))
1773 ret = tcp_v4_do_rcv(sk, skb);
1775 sk_add_backlog(sk, skb);
1783 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1786 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1788 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1790 tcp_v4_send_reset(skb);
1794 /* Discard frame. */
1803 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1804 tcp_tw_put((struct tcp_tw_bucket *) sk);
1808 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1809 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1810 tcp_tw_put((struct tcp_tw_bucket *) sk);
1813 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1814 skb, th, skb->len)) {
1816 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1820 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1821 tcp_tw_put((struct tcp_tw_bucket *)sk);
1825 /* Fall through to ACK */
1828 tcp_v4_timewait_ack(sk, skb);
1832 case TCP_TW_SUCCESS:;
1837 /* With per-bucket locks this operation is not atomic, so
1838 * this version is no worse.
1840 static void __tcp_v4_rehash(struct sock *sk)
1842 sk->sk_prot->unhash(sk);
1843 sk->sk_prot->hash(sk);
1846 static int tcp_v4_reselect_saddr(struct sock *sk)
1848 struct inet_sock *inet = inet_sk(sk);
1851 __u32 old_saddr = inet->saddr;
1853 __u32 daddr = inet->daddr;
1855 if (inet->opt && inet->opt->srr)
1856 daddr = inet->opt->faddr;
1858 /* Query new route. */
1859 err = ip_route_connect(&rt, daddr, 0,
1861 sk->sk_bound_dev_if,
1863 inet->sport, inet->dport, sk);
1867 __sk_dst_set(sk, &rt->u.dst);
1868 tcp_v4_setup_caps(sk, &rt->u.dst);
1870 new_saddr = rt->rt_src;
1872 if (new_saddr == old_saddr)
1875 if (sysctl_ip_dynaddr > 1) {
1876 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
1877 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
1879 NIPQUAD(new_saddr));
1882 inet->saddr = new_saddr;
1883 inet->rcv_saddr = new_saddr;
1885 /* XXX The only ugly spot where we need to
1886 * XXX really change the socket's identity after
1887 * XXX it has entered the hashes. -DaveM
1889 * Besides that, it does not check for connection
1890 * uniqueness. Wait for trouble.
1892 __tcp_v4_rehash(sk);
1896 int tcp_v4_rebuild_header(struct sock *sk)
1898 struct inet_sock *inet = inet_sk(sk);
1899 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1903 /* Route is OK, nothing to do. */
1908 daddr = inet->daddr;
1909 if (inet->opt && inet->opt->srr)
1910 daddr = inet->opt->faddr;
1913 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1916 .saddr = inet->saddr,
1917 .tos = RT_CONN_FLAGS(sk) } },
1918 .proto = IPPROTO_TCP,
1920 { .sport = inet->sport,
1921 .dport = inet->dport } } };
1923 err = ip_route_output_flow(&rt, &fl, sk, 0);
1926 __sk_dst_set(sk, &rt->u.dst);
1927 tcp_v4_setup_caps(sk, &rt->u.dst);
1931 /* Routing failed... */
1932 sk->sk_route_caps = 0;
1934 if (!sysctl_ip_dynaddr ||
1935 sk->sk_state != TCP_SYN_SENT ||
1936 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
1937 (err = tcp_v4_reselect_saddr(sk)) != 0)
1938 sk->sk_err_soft = -err;
1943 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1945 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1946 struct inet_sock *inet = inet_sk(sk);
1948 sin->sin_family = AF_INET;
1949 sin->sin_addr.s_addr = inet->daddr;
1950 sin->sin_port = inet->dport;
1953 /* VJ's idea. Save the last timestamp seen from this destination
1954 * and hold it at least for the normal timewait interval, to use for duplicate
1955 * segment detection in subsequent connections before they enter the synchronized
1959 int tcp_v4_remember_stamp(struct sock *sk)
1961 struct inet_sock *inet = inet_sk(sk);
1962 struct tcp_sock *tp = tcp_sk(sk);
1963 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1964 struct inet_peer *peer = NULL;
1967 if (!rt || rt->rt_dst != inet->daddr) {
1968 peer = inet_getpeer(inet->daddr, 1);
1972 rt_bind_peer(rt, 1);
1977 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1978 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1979 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1980 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1981 peer->tcp_ts = tp->rx_opt.ts_recent;
1991 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1993 struct inet_peer *peer = NULL;
1995 peer = inet_getpeer(tw->tw_daddr, 1);
1998 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1999 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
2000 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
2001 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
2002 peer->tcp_ts = tw->tw_ts_recent;
2011 struct tcp_func ipv4_specific = {
2012 .queue_xmit = ip_queue_xmit,
2013 .send_check = tcp_v4_send_check,
2014 .rebuild_header = tcp_v4_rebuild_header,
2015 .conn_request = tcp_v4_conn_request,
2016 .syn_recv_sock = tcp_v4_syn_recv_sock,
2017 .remember_stamp = tcp_v4_remember_stamp,
2018 .net_header_len = sizeof(struct iphdr),
2019 .setsockopt = ip_setsockopt,
2020 .getsockopt = ip_getsockopt,
2021 .addr2sockaddr = v4_addr2sockaddr,
2022 .sockaddr_len = sizeof(struct sockaddr_in),
2025 /* NOTE: A lot of things are set to zero explicitly by the call to
2026 * sk_alloc(), so they need not be done here.
2028 static int tcp_v4_init_sock(struct sock *sk)
2030 struct tcp_sock *tp = tcp_sk(sk);
2032 skb_queue_head_init(&tp->out_of_order_queue);
2033 tcp_init_xmit_timers(sk);
2034 tcp_prequeue_init(tp);
2036 tp->rto = TCP_TIMEOUT_INIT;
2037 tp->mdev = TCP_TIMEOUT_INIT;
2039 /* So many TCP implementations out there (incorrectly) count the
2040 * initial SYN frame in their delayed-ACK and congestion control
2041 * algorithms that we must have the following bandaid to talk
2042 * efficiently to them. -DaveM
2046 /* See draft-stevens-tcpca-spec-01 for discussion of the
2047 * initialization of these values.
2049 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2050 tp->snd_cwnd_clamp = ~0;
2051 tp->mss_cache = 536;
2053 tp->reordering = sysctl_tcp_reordering;
2054 tp->ca_ops = &tcp_init_congestion_ops;
2056 sk->sk_state = TCP_CLOSE;
2058 sk->sk_write_space = sk_stream_write_space;
2059 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2061 tp->af_specific = &ipv4_specific;
2063 sk->sk_sndbuf = sysctl_tcp_wmem[1];
2064 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2066 atomic_inc(&tcp_sockets_allocated);
2071 int tcp_v4_destroy_sock(struct sock *sk)
2073 struct tcp_sock *tp = tcp_sk(sk);
2075 tcp_clear_xmit_timers(sk);
2077 tcp_cleanup_congestion_control(tp);
2079 /* Clean up the write buffer. */
2080 sk_stream_writequeue_purge(sk);
2082 /* Cleans up our, hopefully empty, out_of_order_queue. */
2083 __skb_queue_purge(&tp->out_of_order_queue);
2085 /* Clean the prequeue; it must really be empty. */
2086 __skb_queue_purge(&tp->ucopy.prequeue);
2088 /* Clean up a referenced TCP bind bucket. */
2093 * If sendmsg cached page exists, toss it.
2095 if (sk->sk_sndmsg_page) {
2096 __free_page(sk->sk_sndmsg_page);
2097 sk->sk_sndmsg_page = NULL;
2100 atomic_dec(&tcp_sockets_allocated);
2105 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2107 #ifdef CONFIG_PROC_FS
2108 /* Proc filesystem TCP sock list dumping. */
2110 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2112 return hlist_empty(head) ? NULL :
2113 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2116 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2118 return tw->tw_node.next ?
2119 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2122 static void *listening_get_next(struct seq_file *seq, void *cur)
2124 struct tcp_sock *tp;
2125 struct hlist_node *node;
2126 struct sock *sk = cur;
2127 struct tcp_iter_state* st = seq->private;
2131 sk = sk_head(&tcp_listening_hash[0]);
2137 if (st->state == TCP_SEQ_STATE_OPENREQ) {
2138 struct request_sock *req = cur;
2140 tp = tcp_sk(st->syn_wait_sk);
2144 if (req->rsk_ops->family == st->family) {
2150 if (++st->sbucket >= TCP_SYNQ_HSIZE)
2153 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2155 sk = sk_next(st->syn_wait_sk);
2156 st->state = TCP_SEQ_STATE_LISTENING;
2157 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2160 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2161 if (reqsk_queue_len(&tp->accept_queue))
2163 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2167 sk_for_each_from(sk, node) {
2168 if (sk->sk_family == st->family) {
2173 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2174 if (reqsk_queue_len(&tp->accept_queue)) {
2176 st->uid = sock_i_uid(sk);
2177 st->syn_wait_sk = sk;
2178 st->state = TCP_SEQ_STATE_OPENREQ;
2182 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2184 if (++st->bucket < TCP_LHTABLE_SIZE) {
2185 sk = sk_head(&tcp_listening_hash[st->bucket]);
2193 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2195 void *rc = listening_get_next(seq, NULL);
2197 while (rc && *pos) {
2198 rc = listening_get_next(seq, rc);
2204 static void *established_get_first(struct seq_file *seq)
2206 struct tcp_iter_state* st = seq->private;
2209 for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2211 struct hlist_node *node;
2212 struct tcp_tw_bucket *tw;
2214 /* We can reschedule _before_ having picked the target: */
2215 cond_resched_softirq();
2217 read_lock(&tcp_ehash[st->bucket].lock);
2218 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2219 if (sk->sk_family != st->family) {
2225 st->state = TCP_SEQ_STATE_TIME_WAIT;
2226 tw_for_each(tw, node,
2227 &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2228 if (tw->tw_family != st->family) {
2234 read_unlock(&tcp_ehash[st->bucket].lock);
2235 st->state = TCP_SEQ_STATE_ESTABLISHED;
2241 static void *established_get_next(struct seq_file *seq, void *cur)
2243 struct sock *sk = cur;
2244 struct tcp_tw_bucket *tw;
2245 struct hlist_node *node;
2246 struct tcp_iter_state* st = seq->private;
2250 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2254 while (tw && tw->tw_family != st->family) {
2261 read_unlock(&tcp_ehash[st->bucket].lock);
2262 st->state = TCP_SEQ_STATE_ESTABLISHED;
2264 /* We can reschedule between buckets: */
2265 cond_resched_softirq();
2267 if (++st->bucket < tcp_ehash_size) {
2268 read_lock(&tcp_ehash[st->bucket].lock);
2269 sk = sk_head(&tcp_ehash[st->bucket].chain);
2277 sk_for_each_from(sk, node) {
2278 if (sk->sk_family == st->family)
2282 st->state = TCP_SEQ_STATE_TIME_WAIT;
2283 tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2291 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2293 void *rc = established_get_first(seq);
2296 rc = established_get_next(seq, rc);
2302 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2305 struct tcp_iter_state* st = seq->private;
2308 st->state = TCP_SEQ_STATE_LISTENING;
2309 rc = listening_get_idx(seq, &pos);
2312 tcp_listen_unlock();
2314 st->state = TCP_SEQ_STATE_ESTABLISHED;
2315 rc = established_get_idx(seq, pos);
2321 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2323 struct tcp_iter_state* st = seq->private;
2324 st->state = TCP_SEQ_STATE_LISTENING;
2326 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2329 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2332 struct tcp_iter_state* st;
2334 if (v == SEQ_START_TOKEN) {
2335 rc = tcp_get_idx(seq, 0);
2340 switch (st->state) {
2341 case TCP_SEQ_STATE_OPENREQ:
2342 case TCP_SEQ_STATE_LISTENING:
2343 rc = listening_get_next(seq, v);
2345 tcp_listen_unlock();
2347 st->state = TCP_SEQ_STATE_ESTABLISHED;
2348 rc = established_get_first(seq);
2351 case TCP_SEQ_STATE_ESTABLISHED:
2352 case TCP_SEQ_STATE_TIME_WAIT:
2353 rc = established_get_next(seq, v);
2361 static void tcp_seq_stop(struct seq_file *seq, void *v)
2363 struct tcp_iter_state* st = seq->private;
2365 switch (st->state) {
2366 case TCP_SEQ_STATE_OPENREQ:
2368 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2369 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2371 case TCP_SEQ_STATE_LISTENING:
2372 if (v != SEQ_START_TOKEN)
2373 tcp_listen_unlock();
2375 case TCP_SEQ_STATE_TIME_WAIT:
2376 case TCP_SEQ_STATE_ESTABLISHED:
2378 read_unlock(&tcp_ehash[st->bucket].lock);
2384 static int tcp_seq_open(struct inode *inode, struct file *file)
2386 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2387 struct seq_file *seq;
2388 struct tcp_iter_state *s;
2391 if (unlikely(afinfo == NULL))
2394 s = kmalloc(sizeof(*s), GFP_KERNEL);
2397 memset(s, 0, sizeof(*s));
2398 s->family = afinfo->family;
2399 s->seq_ops.start = tcp_seq_start;
2400 s->seq_ops.next = tcp_seq_next;
2401 s->seq_ops.show = afinfo->seq_show;
2402 s->seq_ops.stop = tcp_seq_stop;
2404 rc = seq_open(file, &s->seq_ops);
2407 seq = file->private_data;
2416 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2419 struct proc_dir_entry *p;
2423 afinfo->seq_fops->owner = afinfo->owner;
2424 afinfo->seq_fops->open = tcp_seq_open;
2425 afinfo->seq_fops->read = seq_read;
2426 afinfo->seq_fops->llseek = seq_lseek;
2427 afinfo->seq_fops->release = seq_release_private;
2429 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2437 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2441 proc_net_remove(afinfo->name);
2442 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2445 static void get_openreq4(struct sock *sk, struct request_sock *req,
2446 char *tmpbuf, int i, int uid)
2448 const struct inet_request_sock *ireq = inet_rsk(req);
2449 int ttd = req->expires - jiffies;
2451 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2452 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2455 ntohs(inet_sk(sk)->sport),
2457 ntohs(ireq->rmt_port),
2459 0, 0, /* could print option size, but that is af dependent. */
2460 1, /* timers active (only the expire timer) */
2461 jiffies_to_clock_t(ttd),
2464 0, /* non standard timer */
2465 0, /* open_requests have no inode */
2466 atomic_read(&sk->sk_refcnt),
2470 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2473 unsigned long timer_expires;
2474 struct tcp_sock *tp = tcp_sk(sp);
2475 struct inet_sock *inet = inet_sk(sp);
2476 unsigned int dest = inet->daddr;
2477 unsigned int src = inet->rcv_saddr;
2478 __u16 destp = ntohs(inet->dport);
2479 __u16 srcp = ntohs(inet->sport);
2481 if (tp->pending == TCP_TIME_RETRANS) {
2483 timer_expires = tp->timeout;
2484 } else if (tp->pending == TCP_TIME_PROBE0) {
2486 timer_expires = tp->timeout;
2487 } else if (timer_pending(&sp->sk_timer)) {
2489 timer_expires = sp->sk_timer.expires;
2492 timer_expires = jiffies;
2495 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2496 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2497 i, src, srcp, dest, destp, sp->sk_state,
2498 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2500 jiffies_to_clock_t(timer_expires - jiffies),
2505 atomic_read(&sp->sk_refcnt), sp,
2506 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2508 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2511 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2513 unsigned int dest, src;
2515 int ttd = tw->tw_ttd - jiffies;
2520 dest = tw->tw_daddr;
2521 src = tw->tw_rcv_saddr;
2522 destp = ntohs(tw->tw_dport);
2523 srcp = ntohs(tw->tw_sport);
2525 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2526 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2527 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2528 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2529 atomic_read(&tw->tw_refcnt), tw);
2534 static int tcp4_seq_show(struct seq_file *seq, void *v)
2536 struct tcp_iter_state* st;
2537 char tmpbuf[TMPSZ + 1];
2539 if (v == SEQ_START_TOKEN) {
2540 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2541 " sl local_address rem_address st tx_queue "
2542 "rx_queue tr tm->when retrnsmt uid timeout "
2548 switch (st->state) {
2549 case TCP_SEQ_STATE_LISTENING:
2550 case TCP_SEQ_STATE_ESTABLISHED:
2551 get_tcp4_sock(v, tmpbuf, st->num);
2553 case TCP_SEQ_STATE_OPENREQ:
2554 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2556 case TCP_SEQ_STATE_TIME_WAIT:
2557 get_timewait4_sock(v, tmpbuf, st->num);
2560 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
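/* Illustrative aside (not part of the original file): the rows printed above
 * end up in /proc/net/tcp, with addresses, ports and state in hexadecimal.
 * A user-space sketch of reading a few fields back (kept under #if 0; it is
 * not kernel code, and the field subset parsed is the example's own choice).
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip the header line */
	while (fgets(line, sizeof(line), f)) {
		unsigned int laddr, lport, raddr, rport, state;

		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
			   &laddr, &lport, &raddr, &rport, &state) == 5)
			printf("local %08X:%u remote %08X:%u state %02X\n",
			       laddr, lport, raddr, rport, state);
	}
	fclose(f);
	return 0;
}
#endif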
2565 static struct file_operations tcp4_seq_fops;
2566 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2567 .owner = THIS_MODULE,
2570 .seq_show = tcp4_seq_show,
2571 .seq_fops = &tcp4_seq_fops,
2574 int __init tcp4_proc_init(void)
2576 return tcp_proc_register(&tcp4_seq_afinfo);
2579 void tcp4_proc_exit(void)
2581 tcp_proc_unregister(&tcp4_seq_afinfo);
2583 #endif /* CONFIG_PROC_FS */
2585 struct proto tcp_prot = {
2587 .owner = THIS_MODULE,
2589 .connect = tcp_v4_connect,
2590 .disconnect = tcp_disconnect,
2591 .accept = tcp_accept,
2593 .init = tcp_v4_init_sock,
2594 .destroy = tcp_v4_destroy_sock,
2595 .shutdown = tcp_shutdown,
2596 .setsockopt = tcp_setsockopt,
2597 .getsockopt = tcp_getsockopt,
2598 .sendmsg = tcp_sendmsg,
2599 .recvmsg = tcp_recvmsg,
2600 .backlog_rcv = tcp_v4_do_rcv,
2601 .hash = tcp_v4_hash,
2602 .unhash = tcp_unhash,
2603 .get_port = tcp_v4_get_port,
2604 .enter_memory_pressure = tcp_enter_memory_pressure,
2605 .sockets_allocated = &tcp_sockets_allocated,
2606 .memory_allocated = &tcp_memory_allocated,
2607 .memory_pressure = &tcp_memory_pressure,
2608 .sysctl_mem = sysctl_tcp_mem,
2609 .sysctl_wmem = sysctl_tcp_wmem,
2610 .sysctl_rmem = sysctl_tcp_rmem,
2611 .max_header = MAX_TCP_HEADER,
2612 .obj_size = sizeof(struct tcp_sock),
2613 .rsk_prot = &tcp_request_sock_ops,
2618 void __init tcp_v4_init(struct net_proto_family *ops)
2620 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2622 panic("Failed to create the TCP control socket.\n");
2623 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2624 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2626 /* Unhash it so that IP input processing does not even
2627 * see it, we do not wish this socket to see incoming
2630 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2633 EXPORT_SYMBOL(ipv4_specific);
2634 EXPORT_SYMBOL(tcp_bind_hash);
2635 EXPORT_SYMBOL(tcp_bucket_create);
2636 EXPORT_SYMBOL(tcp_hashinfo);
2637 EXPORT_SYMBOL(tcp_inherit_port);
2638 EXPORT_SYMBOL(tcp_listen_wlock);
2639 EXPORT_SYMBOL(tcp_port_rover);
2640 EXPORT_SYMBOL(tcp_prot);
2641 EXPORT_SYMBOL(tcp_put_port);
2642 EXPORT_SYMBOL(tcp_unhash);
2643 EXPORT_SYMBOL(tcp_v4_conn_request);
2644 EXPORT_SYMBOL(tcp_v4_connect);
2645 EXPORT_SYMBOL(tcp_v4_do_rcv);
2646 EXPORT_SYMBOL(tcp_v4_rebuild_header);
2647 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2648 EXPORT_SYMBOL(tcp_v4_send_check);
2649 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2651 #ifdef CONFIG_PROC_FS
2652 EXPORT_SYMBOL(tcp_proc_register);
2653 EXPORT_SYMBOL(tcp_proc_unregister);
2655 EXPORT_SYMBOL(sysctl_local_port_range);
2656 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2657 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);