[INET]: Generalise the TCP sock ID lookup routines
[linux-2.6] net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      to a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97         .port_rover     = 1024 - 1,
98 };
99
100 /*
101  * This array holds the first and last local port number.
102  * For high-usage systems, use sysctl to change this to
103  * 32768-61000
104  */
105 int sysctl_local_port_range[2] = { 1024, 4999 };
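
/*
 * Illustrative sketch (user space, for this comment only): the range above is
 * exposed as /proc/sys/net/ipv4/ip_local_port_range, so a high-usage system
 * can widen it roughly like this; requires root.
 */
#include <stdio.h>

static int set_local_port_range(int low, int high)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "w");

	if (!f)
		return -1;
	/* The file takes two integers: the first and last local port. */
	fprintf(f, "%d %d\n", low, high);
	return fclose(f);
}
/* e.g. set_local_port_range(32768, 61000); */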
106
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
108 {
109         const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
110         struct sock *sk2;
111         struct hlist_node *node;
112         int reuse = sk->sk_reuse;
113
114         sk_for_each_bound(sk2, node, &tb->owners) {
115                 if (sk != sk2 &&
116                     !tcp_v6_ipv6only(sk2) &&
117                     (!sk->sk_bound_dev_if ||
118                      !sk2->sk_bound_dev_if ||
119                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120                         if (!reuse || !sk2->sk_reuse ||
121                             sk2->sk_state == TCP_LISTEN) {
122                                 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
123                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124                                     sk2_rcv_saddr == sk_rcv_saddr)
125                                         break;
126                         }
127                 }
128         }
129         return node != NULL;
130 }
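
/*
 * Illustrative sketch (user space): the conflict rules above are what decide
 * whether bind() succeeds when a port already has owners -- roughly, the new
 * and the existing sockets must all have SO_REUSEADDR set, the existing owner
 * must not be listening, or the bound addresses must differ.  Setting the
 * option before bind() is how a program opts into the relaxed check.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int bind_reusable(int port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in addr;

	if (fd < 0)
		return -1;
	/* Ask for the relaxed bind-conflict check implemented above. */
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family      = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port        = htons((unsigned short)port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}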
131
132 /* Obtain a reference to a local port for the given sock;
133  * if snum is zero it means select any available local port.
134  */
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
136 {
137         struct inet_bind_hashbucket *head;
138         struct hlist_node *node;
139         struct inet_bind_bucket *tb;
140         int ret;
141
142         local_bh_disable();
143         if (!snum) {
144                 int low = sysctl_local_port_range[0];
145                 int high = sysctl_local_port_range[1];
146                 int remaining = (high - low) + 1;
147                 int rover;
148
149                 spin_lock(&tcp_hashinfo.portalloc_lock);
150                 if (tcp_hashinfo.port_rover < low)
151                         rover = low;
152                 else
153                         rover = tcp_hashinfo.port_rover;
154                 do {
155                         rover++;
156                         if (rover > high)
157                                 rover = low;
158                         head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159                         spin_lock(&head->lock);
160                         inet_bind_bucket_for_each(tb, node, &head->chain)
161                                 if (tb->port == rover)
162                                         goto next;
163                         break;
164                 next:
165                         spin_unlock(&head->lock);
166                 } while (--remaining > 0);
167                 tcp_hashinfo.port_rover = rover;
168                 spin_unlock(&tcp_hashinfo.portalloc_lock);
169
170                 /* Exhausted local port range during search?  It is not
171                  * possible for us to be holding one of the bind hash
172                  * locks if this test triggers, because if 'remaining'
173                  * drops to zero, we broke out of the do/while loop at
174                  * the top level, not from the 'break;' statement.
175                  */
176                 ret = 1;
177                 if (unlikely(remaining <= 0))
178                         goto fail;
179
180                 /* OK, here is the one we will use.  HEAD is
181                  * non-NULL and we hold its lock.
182                  */
183                 snum = rover;
184         } else {
185                 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186                 spin_lock(&head->lock);
187                 inet_bind_bucket_for_each(tb, node, &head->chain)
188                         if (tb->port == snum)
189                                 goto tb_found;
190         }
191         tb = NULL;
192         goto tb_not_found;
193 tb_found:
194         if (!hlist_empty(&tb->owners)) {
195                 if (sk->sk_reuse > 1)
196                         goto success;
197                 if (tb->fastreuse > 0 &&
198                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
199                         goto success;
200                 } else {
201                         ret = 1;
202                         if (tcp_bind_conflict(sk, tb))
203                                 goto fail_unlock;
204                 }
205         }
206 tb_not_found:
207         ret = 1;
208         if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
209                 goto fail_unlock;
210         if (hlist_empty(&tb->owners)) {
211                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
212                         tb->fastreuse = 1;
213                 else
214                         tb->fastreuse = 0;
215         } else if (tb->fastreuse &&
216                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
217                 tb->fastreuse = 0;
218 success:
219         if (!inet_sk(sk)->bind_hash)
220                 inet_bind_hash(sk, tb, snum);
221         BUG_TRAP(inet_sk(sk)->bind_hash == tb);
222         ret = 0;
223
224 fail_unlock:
225         spin_unlock(&head->lock);
226 fail:
227         local_bh_enable();
228         return ret;
229 }
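
/*
 * Illustrative sketch: the ephemeral-port search above, reduced to its core.
 * A rover remembers where the previous search stopped, the scan wraps from
 * 'high' back to 'low', and 'port_in_use' stands in for the bind-hash lookup
 * (the callback is hypothetical, not a kernel interface).
 */
static int pick_ephemeral_port(int low, int high, int *rover,
			       int (*port_in_use)(int port))
{
	int remaining = (high - low) + 1;
	int port = (*rover < low) ? low : *rover;

	do {
		if (++port > high)
			port = low;
		if (!port_in_use(port)) {
			*rover = port;	/* resume here on the next call */
			return port;
		}
	} while (--remaining > 0);

	return -1;	/* the whole local port range is exhausted */
}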
230
231 static void tcp_v4_hash(struct sock *sk)
232 {
233         inet_hash(&tcp_hashinfo, sk);
234 }
235
236 void tcp_unhash(struct sock *sk)
237 {
238         inet_unhash(&tcp_hashinfo, sk);
239 }
240
241 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
242 {
243         return secure_tcp_sequence_number(skb->nh.iph->daddr,
244                                           skb->nh.iph->saddr,
245                                           skb->h.th->dest,
246                                           skb->h.th->source);
247 }
248
249 /* called with local bh disabled */
250 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
251                                       struct inet_timewait_sock **twp)
252 {
253         struct inet_sock *inet = inet_sk(sk);
254         u32 daddr = inet->rcv_saddr;
255         u32 saddr = inet->daddr;
256         int dif = sk->sk_bound_dev_if;
257         INET_ADDR_COOKIE(acookie, saddr, daddr)
258         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
259         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
260         struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
261         struct sock *sk2;
262         const struct hlist_node *node;
263         struct inet_timewait_sock *tw;
264
265         write_lock(&head->lock);
266
267         /* Check TIME-WAIT sockets first. */
268         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
269                 tw = inet_twsk(sk2);
270
271                 if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
272                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
273                         struct tcp_sock *tp = tcp_sk(sk);
274
275                         /* With PAWS, it is safe from the viewpoint
276                            of data integrity. Even without PAWS it
277                            is safe provided sequence spaces do not
278                            overlap i.e. at data rates <= 80Mbit/sec.
279
280                            Actually, the idea is close to VJ's: only
281                            the timestamp cache is held not per host
282                            but per port pair, and the TW bucket is
283                            used as the state holder.
284
285                            If the TW bucket has already been destroyed
286                            we fall back to VJ's scheme and use the
287                            initial timestamp retrieved from the peer table.
288                          */
289                         if (tcptw->tw_ts_recent_stamp &&
290                             (!twp || (sysctl_tcp_tw_reuse &&
291                                       xtime.tv_sec -
292                                       tcptw->tw_ts_recent_stamp > 1))) {
293                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
294                                 if (tp->write_seq == 0)
295                                         tp->write_seq = 1;
296                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
297                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
298                                 sock_hold(sk2);
299                                 goto unique;
300                         } else
301                                 goto not_unique;
302                 }
303         }
304         tw = NULL;
305
306         /* And established part... */
307         sk_for_each(sk2, node, &head->chain) {
308                 if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
309                         goto not_unique;
310         }
311
312 unique:
313         /* Must record num and sport now. Otherwise we will see
314          * a socket with a funny identity in the hash table. */
315         inet->num = lport;
316         inet->sport = htons(lport);
317         sk->sk_hashent = hash;
318         BUG_TRAP(sk_unhashed(sk));
319         __sk_add_node(sk, &head->chain);
320         sock_prot_inc_use(sk->sk_prot);
321         write_unlock(&head->lock);
322
323         if (twp) {
324                 *twp = tw;
325                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
326         } else if (tw) {
327                 /* Silly. Should hash-dance instead... */
328                 tcp_tw_deschedule(tw);
329                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
330
331                 inet_twsk_put(tw);
332         }
333
334         return 0;
335
336 not_unique:
337         write_unlock(&head->lock);
338         return -EADDRNOTAVAIL;
339 }
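
/*
 * Illustrative sketch: the TIME-WAIT reuse decision taken in the loop above,
 * written as a standalone predicate.  The structure and names are local to
 * this sketch; 'now' plays the role of xtime.tv_sec.
 */
struct tw_reuse_info {
	long	ts_recent_stamp;	/* when the peer's timestamp was last recorded */
	int	caller_wants_tw;	/* twp != NULL in the code above */
	int	tw_reuse_sysctl;	/* sysctl_tcp_tw_reuse */
};

static int can_reuse_time_wait(const struct tw_reuse_info *tw, long now)
{
	/* Without a recorded peer timestamp PAWS cannot protect us. */
	if (!tw->ts_recent_stamp)
		return 0;
	/* Callers that did not ask for the bucket back may take it over. */
	if (!tw->caller_wants_tw)
		return 1;
	/* Otherwise the admin must allow reuse and the recorded timestamp
	 * must be old enough that the new connection's timestamps are
	 * strictly larger, so PAWS rejects stray old duplicates.
	 */
	return tw->tw_reuse_sysctl && now - tw->ts_recent_stamp > 1;
}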
340
341 static inline u32 connect_port_offset(const struct sock *sk)
342 {
343         const struct inet_sock *inet = inet_sk(sk);
344
345         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
346                                          inet->dport);
347 }
348
349 /*
350  * Bind a port for a connect operation and hash it.
351  */
352 static inline int tcp_v4_hash_connect(struct sock *sk)
353 {
354         const unsigned short snum = inet_sk(sk)->num;
355         struct inet_bind_hashbucket *head;
356         struct inet_bind_bucket *tb;
357         int ret;
358
359         if (!snum) {
360                 int low = sysctl_local_port_range[0];
361                 int high = sysctl_local_port_range[1];
362                 int range = high - low;
363                 int i;
364                 int port;
365                 static u32 hint;
366                 u32 offset = hint + connect_port_offset(sk);
367                 struct hlist_node *node;
368                 struct inet_timewait_sock *tw = NULL;
369
370                 local_bh_disable();
371                 for (i = 1; i <= range; i++) {
372                         port = low + (i + offset) % range;
373                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
374                         spin_lock(&head->lock);
375
376                         /* Does not bother with rcv_saddr checks,
377                          * because the established check is already
378                          * unique enough.
379                          */
380                         inet_bind_bucket_for_each(tb, node, &head->chain) {
381                                 if (tb->port == port) {
382                                         BUG_TRAP(!hlist_empty(&tb->owners));
383                                         if (tb->fastreuse >= 0)
384                                                 goto next_port;
385                                         if (!__tcp_v4_check_established(sk,
386                                                                         port,
387                                                                         &tw))
388                                                 goto ok;
389                                         goto next_port;
390                                 }
391                         }
392
393                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
394                         if (!tb) {
395                                 spin_unlock(&head->lock);
396                                 break;
397                         }
398                         tb->fastreuse = -1;
399                         goto ok;
400
401                 next_port:
402                         spin_unlock(&head->lock);
403                 }
404                 local_bh_enable();
405
406                 return -EADDRNOTAVAIL;
407
408 ok:
409                 hint += i;
410
411                 /* Head lock still held and bh's disabled */
412                 inet_bind_hash(sk, tb, port);
413                 if (sk_unhashed(sk)) {
414                         inet_sk(sk)->sport = htons(port);
415                         __inet_hash(&tcp_hashinfo, sk, 0);
416                 }
417                 spin_unlock(&head->lock);
418
419                 if (tw) {
420                         tcp_tw_deschedule(tw);
421                         inet_twsk_put(tw);
422                 }
423
424                 ret = 0;
425                 goto out;
426         }
427
428         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
429         tb  = inet_sk(sk)->bind_hash;
430         spin_lock_bh(&head->lock);
431         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
432                 __inet_hash(&tcp_hashinfo, sk, 0);
433                 spin_unlock_bh(&head->lock);
434                 return 0;
435         } else {
436                 spin_unlock(&head->lock);
437                 /* No definite answer... Walk the established hash table */
438                 ret = __tcp_v4_check_established(sk, snum, NULL);
439 out:
440                 local_bh_enable();
441                 return ret;
442         }
443 }
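
/*
 * Illustrative sketch: the arithmetic behind the 'hint + offset' scheme used
 * above.  'per_dst_offset' is a stable per-destination value (produced by
 * secure_tcp_port_ephemeral() in the real code), so searches for different
 * destinations start at different points of the range, while 'hint' advances
 * after every successful connect so repeated connects do not restart from the
 * same spot.
 */
static int candidate_port(int low, int range, unsigned int hint,
			  unsigned int per_dst_offset, int attempt)
{
	/* attempt runs 1, 2, ... just like 'i' in the loop above */
	return low + (attempt + hint + per_dst_offset) % range;
}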
444
445 /* This will initiate an outgoing connection. */
446 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
447 {
448         struct inet_sock *inet = inet_sk(sk);
449         struct tcp_sock *tp = tcp_sk(sk);
450         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
451         struct rtable *rt;
452         u32 daddr, nexthop;
453         int tmp;
454         int err;
455
456         if (addr_len < sizeof(struct sockaddr_in))
457                 return -EINVAL;
458
459         if (usin->sin_family != AF_INET)
460                 return -EAFNOSUPPORT;
461
462         nexthop = daddr = usin->sin_addr.s_addr;
463         if (inet->opt && inet->opt->srr) {
464                 if (!daddr)
465                         return -EINVAL;
466                 nexthop = inet->opt->faddr;
467         }
468
469         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
470                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
471                                IPPROTO_TCP,
472                                inet->sport, usin->sin_port, sk);
473         if (tmp < 0)
474                 return tmp;
475
476         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
477                 ip_rt_put(rt);
478                 return -ENETUNREACH;
479         }
480
481         if (!inet->opt || !inet->opt->srr)
482                 daddr = rt->rt_dst;
483
484         if (!inet->saddr)
485                 inet->saddr = rt->rt_src;
486         inet->rcv_saddr = inet->saddr;
487
488         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
489                 /* Reset inherited state */
490                 tp->rx_opt.ts_recent       = 0;
491                 tp->rx_opt.ts_recent_stamp = 0;
492                 tp->write_seq              = 0;
493         }
494
495         if (sysctl_tcp_tw_recycle &&
496             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
497                 struct inet_peer *peer = rt_get_peer(rt);
498
499                 /* VJ's idea. We save the last timestamp seen from
500                  * the destination in the peer table when entering TIME-WAIT,
501                  * and initialize rx_opt.ts_recent from it when trying a new connection.
502                  */
503
504                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
505                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
506                         tp->rx_opt.ts_recent = peer->tcp_ts;
507                 }
508         }
509
510         inet->dport = usin->sin_port;
511         inet->daddr = daddr;
512
513         tp->ext_header_len = 0;
514         if (inet->opt)
515                 tp->ext_header_len = inet->opt->optlen;
516
517         tp->rx_opt.mss_clamp = 536;
518
519         /* Socket identity is still unknown (sport may be zero).
520          * However we set the state to SYN-SENT and, without releasing the
521          * socket lock, select a source port, enter ourselves into the hash
522          * tables and complete initialization after this.
523          */
524         tcp_set_state(sk, TCP_SYN_SENT);
525         err = tcp_v4_hash_connect(sk);
526         if (err)
527                 goto failure;
528
529         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
530         if (err)
531                 goto failure;
532
533         /* OK, now commit destination to socket.  */
534         sk_setup_caps(sk, &rt->u.dst);
535
536         if (!tp->write_seq)
537                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
538                                                            inet->daddr,
539                                                            inet->sport,
540                                                            usin->sin_port);
541
542         inet->id = tp->write_seq ^ jiffies;
543
544         err = tcp_connect(sk);
545         rt = NULL;
546         if (err)
547                 goto failure;
548
549         return 0;
550
551 failure:
552         /* This unhashes the socket and releases the local port, if necessary. */
553         tcp_set_state(sk, TCP_CLOSE);
554         ip_rt_put(rt);
555         sk->sk_route_caps = 0;
556         inet->dport = 0;
557         return err;
558 }
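
/*
 * Illustrative sketch (user space): the function above is what ultimately runs
 * underneath an ordinary blocking connect() on an IPv4 TCP socket.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int tcp_connect_to(const char *ip, int port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in dst;

	if (fd < 0)
		return -1;
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port   = htons((unsigned short)port);
	if (inet_pton(AF_INET, ip, &dst.sin_addr) != 1 ||
	    connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);	/* failure surfaces as e.g. ECONNREFUSED or ETIMEDOUT */
		return -1;
	}
	return fd;
}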
559
560 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
561 {
562         return ((struct rtable *)skb->dst)->rt_iif;
563 }
564
565 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
566 {
567         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
568 }
569
570 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
571                                               struct request_sock ***prevp,
572                                               __u16 rport,
573                                               __u32 raddr, __u32 laddr)
574 {
575         struct listen_sock *lopt = tp->accept_queue.listen_opt;
576         struct request_sock *req, **prev;
577
578         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
579              (req = *prev) != NULL;
580              prev = &req->dl_next) {
581                 const struct inet_request_sock *ireq = inet_rsk(req);
582
583                 if (ireq->rmt_port == rport &&
584                     ireq->rmt_addr == raddr &&
585                     ireq->loc_addr == laddr &&
586                     TCP_INET_FAMILY(req->rsk_ops->family)) {
587                         BUG_TRAP(!req->sk);
588                         *prevp = prev;
589                         break;
590                 }
591         }
592
593         return req;
594 }
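
/*
 * Illustrative sketch: why the search above hands back a pointer-to-pointer.
 * Keeping '*prev' aimed at the link that leads to the match lets the caller
 * unlink the entry from the singly linked chain in O(1), without a second
 * walk.  'struct sketch_req' exists only for this example.
 */
struct sketch_req {
	struct sketch_req *dl_next;
	int key;
};

static struct sketch_req *sketch_search(struct sketch_req **head, int key,
					struct sketch_req ***prevp)
{
	struct sketch_req *req, **prev;

	for (prev = head; (req = *prev) != NULL; prev = &req->dl_next)
		if (req->key == key)
			break;
	*prevp = prev;		/* points at the link to 'req' (or the tail) */
	return req;
}

static void sketch_unlink(struct sketch_req **prev, struct sketch_req *req)
{
	*prev = req->dl_next;	/* splice the entry out with one store */
}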
595
596 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
597 {
598         struct tcp_sock *tp = tcp_sk(sk);
599         struct listen_sock *lopt = tp->accept_queue.listen_opt;
600         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
601
602         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
603         tcp_synq_added(sk);
604 }
605
606
607 /*
608  * This routine does path mtu discovery as defined in RFC1191.
609  */
610 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
611                                      u32 mtu)
612 {
613         struct dst_entry *dst;
614         struct inet_sock *inet = inet_sk(sk);
615         struct tcp_sock *tp = tcp_sk(sk);
616
617         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
618          * sent out by Linux are always < 576 bytes, so they should go through
619          * unfragmented).
620          */
621         if (sk->sk_state == TCP_LISTEN)
622                 return;
623
624         /* We don't check in the dst entry if pmtu discovery is forbidden
625          * on this route. We just assume that no packet-too-big packets
626          * are sent back when pmtu discovery is not active.
627          * There is a small race when the user changes this flag in the
628          * route, but I think that's acceptable.
629          */
630         if ((dst = __sk_dst_check(sk, 0)) == NULL)
631                 return;
632
633         dst->ops->update_pmtu(dst, mtu);
634
635         /* Something is about to go wrong... Remember the soft error
636          * in case this connection is not able to recover.
637          */
638         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
639                 sk->sk_err_soft = EMSGSIZE;
640
641         mtu = dst_mtu(dst);
642
643         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
644             tp->pmtu_cookie > mtu) {
645                 tcp_sync_mss(sk, mtu);
646
647                 /* Resend the TCP packet because it's
648                  * clear that the old packet has been
649                  * dropped. This is the new "fast" path mtu
650                  * discovery.
651                  */
652                 tcp_simple_retransmit(sk);
653         } /* else let the usual retransmit timer handle it */
654 }
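
/*
 * Illustrative sketch: the arithmetic that a smaller learned path MTU implies
 * for the MSS (what tcp_sync_mss() recomputes above).  The numbers assume
 * plain IPv4 and TCP headers of 20 bytes each; the real code also accounts
 * for IP and TCP option space.
 */
static unsigned int mss_for_pmtu(unsigned int pmtu)
{
	const unsigned int ip_hdr_len  = 20;	/* IPv4 header, no options */
	const unsigned int tcp_hdr_len = 20;	/* TCP header, no options */

	return pmtu - ip_hdr_len - tcp_hdr_len;
}
/* e.g. an ICMP "fragmentation needed" quoting an MTU of 1400 yields MSS 1360 */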
655
656 /*
657  * This routine is called by the ICMP module when it gets some
658  * sort of error condition.  If err < 0 then the socket should
659  * be closed and the error returned to the user.  If err > 0
660  * it's just the icmp type << 8 | icmp code.  After adjustment
661  * header points to the first 8 bytes of the tcp header.  We need
662  * to find the appropriate port.
663  *
664  * The locking strategy used here is very "optimistic". When
665  * someone else accesses the socket the ICMP is just dropped
666  * and for some paths there is no check at all.
667  * A more general error queue to queue errors for later handling
668  * is probably better.
669  *
670  */
671
672 void tcp_v4_err(struct sk_buff *skb, u32 info)
673 {
674         struct iphdr *iph = (struct iphdr *)skb->data;
675         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
676         struct tcp_sock *tp;
677         struct inet_sock *inet;
678         int type = skb->h.icmph->type;
679         int code = skb->h.icmph->code;
680         struct sock *sk;
681         __u32 seq;
682         int err;
683
684         if (skb->len < (iph->ihl << 2) + 8) {
685                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
686                 return;
687         }
688
689         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
690                          th->source, tcp_v4_iif(skb));
691         if (!sk) {
692                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
693                 return;
694         }
695         if (sk->sk_state == TCP_TIME_WAIT) {
696                 inet_twsk_put((struct inet_timewait_sock *)sk);
697                 return;
698         }
699
700         bh_lock_sock(sk);
701         /* If too many ICMPs get dropped on busy
702          * servers this needs to be solved differently.
703          */
704         if (sock_owned_by_user(sk))
705                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
706
707         if (sk->sk_state == TCP_CLOSE)
708                 goto out;
709
710         tp = tcp_sk(sk);
711         seq = ntohl(th->seq);
712         if (sk->sk_state != TCP_LISTEN &&
713             !between(seq, tp->snd_una, tp->snd_nxt)) {
714                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
715                 goto out;
716         }
717
718         switch (type) {
719         case ICMP_SOURCE_QUENCH:
720                 /* Just silently ignore these. */
721                 goto out;
722         case ICMP_PARAMETERPROB:
723                 err = EPROTO;
724                 break;
725         case ICMP_DEST_UNREACH:
726                 if (code > NR_ICMP_UNREACH)
727                         goto out;
728
729                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
730                         if (!sock_owned_by_user(sk))
731                                 do_pmtu_discovery(sk, iph, info);
732                         goto out;
733                 }
734
735                 err = icmp_err_convert[code].errno;
736                 break;
737         case ICMP_TIME_EXCEEDED:
738                 err = EHOSTUNREACH;
739                 break;
740         default:
741                 goto out;
742         }
743
744         switch (sk->sk_state) {
745                 struct request_sock *req, **prev;
746         case TCP_LISTEN:
747                 if (sock_owned_by_user(sk))
748                         goto out;
749
750                 req = tcp_v4_search_req(tp, &prev, th->dest,
751                                         iph->daddr, iph->saddr);
752                 if (!req)
753                         goto out;
754
755                 /* ICMPs are not backlogged, hence we cannot get
756                    an established socket here.
757                  */
758                 BUG_TRAP(!req->sk);
759
760                 if (seq != tcp_rsk(req)->snt_isn) {
761                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
762                         goto out;
763                 }
764
765                 /*
766                  * Still in SYN_RECV, just remove it silently.
767                  * There is no good way to pass the error to the newly
768                  * created socket, and POSIX does not want network
769                  * errors returned from accept().
770                  */
771                 tcp_synq_drop(sk, req, prev);
772                 goto out;
773
774         case TCP_SYN_SENT:
775         case TCP_SYN_RECV:  /* Should not happen, but it can,
776                                e.g. if SYNs crossed.
777                              */
778                 if (!sock_owned_by_user(sk)) {
779                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
780                         sk->sk_err = err;
781
782                         sk->sk_error_report(sk);
783
784                         tcp_done(sk);
785                 } else {
786                         sk->sk_err_soft = err;
787                 }
788                 goto out;
789         }
790
791         /* If we've already connected we will keep trying
792          * until we time out, or the user gives up.
793          *
794          * rfc1122 4.2.3.9 allows us to consider as hard errors
795          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
796          * but it is obsoleted by pmtu discovery).
797          *
798          * Note that in the modern internet, where routing is unreliable
799          * and broken firewalls sit in every dark corner sending random
800          * errors ordered by their masters, even these two messages finally
801          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
802          *
803          * Now we are in compliance with RFCs.
804          *                                                      --ANK (980905)
805          */
806
807         inet = inet_sk(sk);
808         if (!sock_owned_by_user(sk) && inet->recverr) {
809                 sk->sk_err = err;
810                 sk->sk_error_report(sk);
811         } else  { /* Only an error on timeout */
812                 sk->sk_err_soft = err;
813         }
814
815 out:
816         bh_unlock_sock(sk);
817         sock_put(sk);
818 }
819
820 /* This routine computes an IPv4 TCP checksum. */
821 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
822                        struct sk_buff *skb)
823 {
824         struct inet_sock *inet = inet_sk(sk);
825
826         if (skb->ip_summed == CHECKSUM_HW) {
827                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
828                 skb->csum = offsetof(struct tcphdr, check);
829         } else {
830                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
831                                          csum_partial((char *)th,
832                                                       th->doff << 2,
833                                                       skb->csum));
834         }
835 }
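
/*
 * Illustrative sketch: the checksum that tcp_v4_check()/csum_partial() compute
 * above, spelled out as a standalone routine.  It is the 16-bit one's
 * complement sum over an IPv4 pseudo-header (source, destination, protocol,
 * TCP length) followed by the TCP header and payload; the checksum field
 * inside 'seg' must be zero while summing.
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t tcp_checksum_sketch(const uint8_t saddr[4],
				    const uint8_t daddr[4],
				    const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo-header: addresses, zero + protocol (6 = TCP), TCP length. */
	sum += ((uint32_t)saddr[0] << 8) | saddr[1];
	sum += ((uint32_t)saddr[2] << 8) | saddr[3];
	sum += ((uint32_t)daddr[0] << 8) | daddr[1];
	sum += ((uint32_t)daddr[2] << 8) | daddr[3];
	sum += 6;
	sum += (uint32_t)len;

	/* TCP header plus payload, 16 bits at a time. */
	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += (uint32_t)seg[len - 1] << 8;	/* pad the odd byte */

	/* Fold the carries back into the low 16 bits and invert. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}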
836
837 /*
838  *      This routine will send an RST to the other tcp.
839  *
840  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
841  *                    for the reset?
842  *      Answer: if a packet caused an RST, it is not for a socket
843  *              existing in our system; if it does match a socket,
844  *              it is just a duplicate segment or a bug in the other side's
845  *              TCP.  So we build the reply based only on the parameters
846  *              that arrived with the segment.
847  *      Exception: precedence violation. We do not implement it in any case.
848  */
849
850 static void tcp_v4_send_reset(struct sk_buff *skb)
851 {
852         struct tcphdr *th = skb->h.th;
853         struct tcphdr rth;
854         struct ip_reply_arg arg;
855
856         /* Never send a reset in response to a reset. */
857         if (th->rst)
858                 return;
859
860         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
861                 return;
862
863         /* Swap the send and the receive. */
864         memset(&rth, 0, sizeof(struct tcphdr));
865         rth.dest   = th->source;
866         rth.source = th->dest;
867         rth.doff   = sizeof(struct tcphdr) / 4;
868         rth.rst    = 1;
869
870         if (th->ack) {
871                 rth.seq = th->ack_seq;
872         } else {
873                 rth.ack = 1;
874                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
875                                     skb->len - (th->doff << 2));
876         }
877
878         memset(&arg, 0, sizeof arg);
879         arg.iov[0].iov_base = (unsigned char *)&rth;
880         arg.iov[0].iov_len  = sizeof rth;
881         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
882                                       skb->nh.iph->saddr, /*XXX*/
883                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
884         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
885
886         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
887
888         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
889         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
890 }
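
/*
 * Illustrative sketch: the RFC 793 rule applied above for choosing the
 * sequence fields of a RST generated without a socket.  The struct and the
 * helper are local to this sketch.
 */
#include <stdint.h>

struct rst_fields {
	uint32_t seq;		/* sequence number of the RST */
	uint32_t ack_seq;	/* only meaningful when 'ack' is set */
	int	 ack;		/* ACK flag on the outgoing RST */
};

static struct rst_fields rst_fields_for(int in_ack, uint32_t in_seq,
					uint32_t in_ack_seq,
					uint32_t in_payload_len,
					int in_syn, int in_fin)
{
	struct rst_fields r = { 0, 0, 0 };

	if (in_ack) {
		/* The offending segment carried an ACK: the RST takes that
		 * acknowledgment number as its own sequence number and
		 * carries no ACK of its own. */
		r.seq = in_ack_seq;
	} else {
		/* Otherwise acknowledge everything the segment occupied;
		 * SYN and FIN each consume one sequence number. */
		r.ack = 1;
		r.ack_seq = in_seq + in_syn + in_fin + in_payload_len;
	}
	return r;
}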
891
892 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
893    outside of socket context, is certainly ugly. What can I do?
894  */
895
896 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
897                             u32 win, u32 ts)
898 {
899         struct tcphdr *th = skb->h.th;
900         struct {
901                 struct tcphdr th;
902                 u32 tsopt[3];
903         } rep;
904         struct ip_reply_arg arg;
905
906         memset(&rep.th, 0, sizeof(struct tcphdr));
907         memset(&arg, 0, sizeof arg);
908
909         arg.iov[0].iov_base = (unsigned char *)&rep;
910         arg.iov[0].iov_len  = sizeof(rep.th);
911         if (ts) {
912                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
913                                      (TCPOPT_TIMESTAMP << 8) |
914                                      TCPOLEN_TIMESTAMP);
915                 rep.tsopt[1] = htonl(tcp_time_stamp);
916                 rep.tsopt[2] = htonl(ts);
917                 arg.iov[0].iov_len = sizeof(rep);
918         }
919
920         /* Swap the send and the receive. */
921         rep.th.dest    = th->source;
922         rep.th.source  = th->dest;
923         rep.th.doff    = arg.iov[0].iov_len / 4;
924         rep.th.seq     = htonl(seq);
925         rep.th.ack_seq = htonl(ack);
926         rep.th.ack     = 1;
927         rep.th.window  = htons(win);
928
929         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
930                                       skb->nh.iph->saddr, /*XXX*/
931                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
932         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
933
934         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
935
936         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
937 }
938
939 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
940 {
941         struct inet_timewait_sock *tw = inet_twsk(sk);
942         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
943
944         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
945                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
946
947         inet_twsk_put(tw);
948 }
949
950 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
951 {
952         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
953                         req->ts_recent);
954 }
955
956 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
957                                           struct request_sock *req)
958 {
959         struct rtable *rt;
960         const struct inet_request_sock *ireq = inet_rsk(req);
961         struct ip_options *opt = inet_rsk(req)->opt;
962         struct flowi fl = { .oif = sk->sk_bound_dev_if,
963                             .nl_u = { .ip4_u =
964                                       { .daddr = ((opt && opt->srr) ?
965                                                   opt->faddr :
966                                                   ireq->rmt_addr),
967                                         .saddr = ireq->loc_addr,
968                                         .tos = RT_CONN_FLAGS(sk) } },
969                             .proto = IPPROTO_TCP,
970                             .uli_u = { .ports =
971                                        { .sport = inet_sk(sk)->sport,
972                                          .dport = ireq->rmt_port } } };
973
974         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
975                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
976                 return NULL;
977         }
978         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
979                 ip_rt_put(rt);
980                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
981                 return NULL;
982         }
983         return &rt->u.dst;
984 }
985
986 /*
987  *      Send a SYN-ACK after having received a SYN.
988  *      This still operates on a request_sock only, not on a big
989  *      socket.
990  */
991 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
992                               struct dst_entry *dst)
993 {
994         const struct inet_request_sock *ireq = inet_rsk(req);
995         int err = -1;
996         struct sk_buff * skb;
997
998         /* First, grab a route. */
999         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1000                 goto out;
1001
1002         skb = tcp_make_synack(sk, dst, req);
1003
1004         if (skb) {
1005                 struct tcphdr *th = skb->h.th;
1006
1007                 th->check = tcp_v4_check(th, skb->len,
1008                                          ireq->loc_addr,
1009                                          ireq->rmt_addr,
1010                                          csum_partial((char *)th, skb->len,
1011                                                       skb->csum));
1012
1013                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1014                                             ireq->rmt_addr,
1015                                             ireq->opt);
1016                 if (err == NET_XMIT_CN)
1017                         err = 0;
1018         }
1019
1020 out:
1021         dst_release(dst);
1022         return err;
1023 }
1024
1025 /*
1026  *      IPv4 request_sock destructor.
1027  */
1028 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1029 {
1030         if (inet_rsk(req)->opt)
1031                 kfree(inet_rsk(req)->opt);
1032 }
1033
1034 static inline void syn_flood_warning(struct sk_buff *skb)
1035 {
1036         static unsigned long warntime;
1037
1038         if (time_after(jiffies, (warntime + HZ * 60))) {
1039                 warntime = jiffies;
1040                 printk(KERN_INFO
1041                        "possible SYN flooding on port %d. Sending cookies.\n",
1042                        ntohs(skb->h.th->dest));
1043         }
1044 }
1045
1046 /*
1047  * Save and compile IPv4 options into the request_sock if needed.
1048  */
1049 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1050                                                      struct sk_buff *skb)
1051 {
1052         struct ip_options *opt = &(IPCB(skb)->opt);
1053         struct ip_options *dopt = NULL;
1054
1055         if (opt && opt->optlen) {
1056                 int opt_size = optlength(opt);
1057                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1058                 if (dopt) {
1059                         if (ip_options_echo(dopt, skb)) {
1060                                 kfree(dopt);
1061                                 dopt = NULL;
1062                         }
1063                 }
1064         }
1065         return dopt;
1066 }
1067
1068 struct request_sock_ops tcp_request_sock_ops = {
1069         .family         =       PF_INET,
1070         .obj_size       =       sizeof(struct tcp_request_sock),
1071         .rtx_syn_ack    =       tcp_v4_send_synack,
1072         .send_ack       =       tcp_v4_reqsk_send_ack,
1073         .destructor     =       tcp_v4_reqsk_destructor,
1074         .send_reset     =       tcp_v4_send_reset,
1075 };
1076
1077 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1078 {
1079         struct inet_request_sock *ireq;
1080         struct tcp_options_received tmp_opt;
1081         struct request_sock *req;
1082         __u32 saddr = skb->nh.iph->saddr;
1083         __u32 daddr = skb->nh.iph->daddr;
1084         __u32 isn = TCP_SKB_CB(skb)->when;
1085         struct dst_entry *dst = NULL;
1086 #ifdef CONFIG_SYN_COOKIES
1087         int want_cookie = 0;
1088 #else
1089 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1090 #endif
1091
1092         /* Never answer SYNs sent to broadcast or multicast addresses */
1093         if (((struct rtable *)skb->dst)->rt_flags &
1094             (RTCF_BROADCAST | RTCF_MULTICAST))
1095                 goto drop;
1096
1097         /* TW buckets are converted to open requests without
1098          * limitation: they conserve resources and the peer is
1099          * evidently a real one.
1100          */
1101         if (tcp_synq_is_full(sk) && !isn) {
1102 #ifdef CONFIG_SYN_COOKIES
1103                 if (sysctl_tcp_syncookies) {
1104                         want_cookie = 1;
1105                 } else
1106 #endif
1107                 goto drop;
1108         }
1109
1110         /* The accept backlog is full. If we have already queued enough
1111          * warm entries in the syn queue, drop the request. This is better
1112          * than clogging the syn queue with openreqs whose timeouts increase
1113          * exponentially.
1114          */
1115         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1116                 goto drop;
1117
1118         req = reqsk_alloc(&tcp_request_sock_ops);
1119         if (!req)
1120                 goto drop;
1121
1122         tcp_clear_options(&tmp_opt);
1123         tmp_opt.mss_clamp = 536;
1124         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1125
1126         tcp_parse_options(skb, &tmp_opt, 0);
1127
1128         if (want_cookie) {
1129                 tcp_clear_options(&tmp_opt);
1130                 tmp_opt.saw_tstamp = 0;
1131         }
1132
1133         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1134                 /* Some OSes (unknown ones, but I see them on a web server
1135                  * which contains information interesting only for Windows
1136                  * users) do not send their timestamp in the SYN. It is an easy
1137                  * case: we simply do not advertise TS support.
1138                  */
1139                 tmp_opt.saw_tstamp = 0;
1140                 tmp_opt.tstamp_ok  = 0;
1141         }
1142         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1143
1144         tcp_openreq_init(req, &tmp_opt, skb);
1145
1146         ireq = inet_rsk(req);
1147         ireq->loc_addr = daddr;
1148         ireq->rmt_addr = saddr;
1149         ireq->opt = tcp_v4_save_options(sk, skb);
1150         if (!want_cookie)
1151                 TCP_ECN_create_request(req, skb->h.th);
1152
1153         if (want_cookie) {
1154 #ifdef CONFIG_SYN_COOKIES
1155                 syn_flood_warning(skb);
1156 #endif
1157                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1158         } else if (!isn) {
1159                 struct inet_peer *peer = NULL;
1160
1161                 /* VJ's idea. We save the last timestamp seen
1162                  * from the destination in the peer table when entering
1163                  * state TIME-WAIT, and check against it before
1164                  * accepting a new connection request.
1165                  *
1166                  * If "isn" is not zero, this request hit an alive
1167                  * timewait bucket, so all the necessary checks
1168                  * are made in the function processing the timewait state.
1169                  */
1170                 if (tmp_opt.saw_tstamp &&
1171                     sysctl_tcp_tw_recycle &&
1172                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1173                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1174                     peer->v4daddr == saddr) {
1175                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1176                             (s32)(peer->tcp_ts - req->ts_recent) >
1177                                                         TCP_PAWS_WINDOW) {
1178                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1179                                 dst_release(dst);
1180                                 goto drop_and_free;
1181                         }
1182                 }
1183                 /* Kill the following clause, if you dislike this way. */
1184                 else if (!sysctl_tcp_syncookies &&
1185                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1186                           (sysctl_max_syn_backlog >> 2)) &&
1187                          (!peer || !peer->tcp_ts_stamp) &&
1188                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1189                         /* Without syncookies the last quarter of the
1190                          * backlog is filled with destinations proven
1191                          * to be alive.
1192                          * It means that we continue to communicate with
1193                          * destinations already remembered by the time
1194                          * the synflood started.
1195                          */
1196                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1197                                               "request from %u.%u."
1198                                               "%u.%u/%u\n",
1199                                               NIPQUAD(saddr),
1200                                               ntohs(skb->h.th->source)));
1201                         dst_release(dst);
1202                         goto drop_and_free;
1203                 }
1204
1205                 isn = tcp_v4_init_sequence(sk, skb);
1206         }
1207         tcp_rsk(req)->snt_isn = isn;
1208
1209         if (tcp_v4_send_synack(sk, req, dst))
1210                 goto drop_and_free;
1211
1212         if (want_cookie) {
1213                 reqsk_free(req);
1214         } else {
1215                 tcp_v4_synq_add(sk, req);
1216         }
1217         return 0;
1218
1219 drop_and_free:
1220         reqsk_free(req);
1221 drop:
1222         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1223         return 0;
1224 }
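
/*
 * Illustrative sketch of the idea behind the want_cookie branch above; this is
 * NOT the algorithm cookie_v4_init_sequence() actually implements.  The ISN is
 * made a keyed function of the 4-tuple plus a coarse time counter, with a few
 * low bits carrying an MSS table index, so the final ACK alone is enough to
 * validate the connection without any stored request.  'keyed_hash' is a
 * hypothetical stand-in for a real keyed hash function.
 */
#include <stddef.h>
#include <stdint.h>

extern uint32_t keyed_hash(const void *secret, const void *data, size_t len);

struct cookie_tuple {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint32_t counter;	/* coarse time counter, e.g. minutes */
};

static uint32_t make_cookie(const void *secret, struct cookie_tuple t,
			    uint32_t mss_index)	/* index into a small MSS table */
{
	uint32_t h = keyed_hash(secret, &t, sizeof(t));

	/* Low bits carry the MSS index so it survives the round trip. */
	return (h & ~7u) | (mss_index & 7u);
}

static int cookie_is_valid(const void *secret, struct cookie_tuple t,
			   uint32_t isn_from_ack)	/* ack_seq - 1 */
{
	/* The receiver recomputes the hash for the current (and, in practice,
	 * the previous) counter value and ignores the MSS bits. */
	uint32_t h = keyed_hash(secret, &t, sizeof(t));

	return ((isn_from_ack ^ h) & ~7u) == 0;
}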
1225
1226
1227 /*
1228  * The three way handshake has completed - we got a valid ACK -
1229  * now create the new socket.
1230  */
1231 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1232                                   struct request_sock *req,
1233                                   struct dst_entry *dst)
1234 {
1235         struct inet_request_sock *ireq;
1236         struct inet_sock *newinet;
1237         struct tcp_sock *newtp;
1238         struct sock *newsk;
1239
1240         if (sk_acceptq_is_full(sk))
1241                 goto exit_overflow;
1242
1243         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1244                 goto exit;
1245
1246         newsk = tcp_create_openreq_child(sk, req, skb);
1247         if (!newsk)
1248                 goto exit;
1249
1250         sk_setup_caps(newsk, dst);
1251
1252         newtp                 = tcp_sk(newsk);
1253         newinet               = inet_sk(newsk);
1254         ireq                  = inet_rsk(req);
1255         newinet->daddr        = ireq->rmt_addr;
1256         newinet->rcv_saddr    = ireq->loc_addr;
1257         newinet->saddr        = ireq->loc_addr;
1258         newinet->opt          = ireq->opt;
1259         ireq->opt             = NULL;
1260         newinet->mc_index     = tcp_v4_iif(skb);
1261         newinet->mc_ttl       = skb->nh.iph->ttl;
1262         newtp->ext_header_len = 0;
1263         if (newinet->opt)
1264                 newtp->ext_header_len = newinet->opt->optlen;
1265         newinet->id = newtp->write_seq ^ jiffies;
1266
1267         tcp_sync_mss(newsk, dst_mtu(dst));
1268         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1269         tcp_initialize_rcv_mss(newsk);
1270
1271         __inet_hash(&tcp_hashinfo, newsk, 0);
1272         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1273
1274         return newsk;
1275
1276 exit_overflow:
1277         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1278 exit:
1279         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1280         dst_release(dst);
1281         return NULL;
1282 }
1283
1284 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1285 {
1286         struct tcphdr *th = skb->h.th;
1287         struct iphdr *iph = skb->nh.iph;
1288         struct tcp_sock *tp = tcp_sk(sk);
1289         struct sock *nsk;
1290         struct request_sock **prev;
1291         /* Find possible connection requests. */
1292         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1293                                                      iph->saddr, iph->daddr);
1294         if (req)
1295                 return tcp_check_req(sk, skb, req, prev);
1296
1297         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1298                                         th->source, skb->nh.iph->daddr,
1299                                         ntohs(th->dest), tcp_v4_iif(skb));
1300
1301         if (nsk) {
1302                 if (nsk->sk_state != TCP_TIME_WAIT) {
1303                         bh_lock_sock(nsk);
1304                         return nsk;
1305                 }
1306                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1307                 return NULL;
1308         }
1309
1310 #ifdef CONFIG_SYN_COOKIES
1311         if (!th->rst && !th->syn && th->ack)
1312                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1313 #endif
1314         return sk;
1315 }
1316
1317 static int tcp_v4_checksum_init(struct sk_buff *skb)
1318 {
1319         if (skb->ip_summed == CHECKSUM_HW) {
1320                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1321                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1322                                   skb->nh.iph->daddr, skb->csum))
1323                         return 0;
1324
1325                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1326                 skb->ip_summed = CHECKSUM_NONE;
1327         }
1328         if (skb->len <= 76) {
1329                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1330                                  skb->nh.iph->daddr,
1331                                  skb_checksum(skb, 0, skb->len, 0)))
1332                         return -1;
1333                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1334         } else {
1335                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1336                                           skb->nh.iph->saddr,
1337                                           skb->nh.iph->daddr, 0);
1338         }
1339         return 0;
1340 }
1341
1342
1343 /* The socket must have its spinlock held when we get
1344  * here.
1345  *
1346  * We have a potential double-lock case here, so even when
1347  * doing backlog processing we use the BH locking scheme.
1348  * This is because we cannot sleep with the original spinlock
1349  * held.
1350  */
1351 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1352 {
1353         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1354                 TCP_CHECK_TIMER(sk);
1355                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1356                         goto reset;
1357                 TCP_CHECK_TIMER(sk);
1358                 return 0;
1359         }
1360
1361         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1362                 goto csum_err;
1363
1364         if (sk->sk_state == TCP_LISTEN) {
1365                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1366                 if (!nsk)
1367                         goto discard;
1368
1369                 if (nsk != sk) {
1370                         if (tcp_child_process(sk, nsk, skb))
1371                                 goto reset;
1372                         return 0;
1373                 }
1374         }
1375
1376         TCP_CHECK_TIMER(sk);
1377         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1378                 goto reset;
1379         TCP_CHECK_TIMER(sk);
1380         return 0;
1381
1382 reset:
1383         tcp_v4_send_reset(skb);
1384 discard:
1385         kfree_skb(skb);
1386         /* Be careful here. If this function gets more complicated and
1387          * gcc suffers from register pressure on the x86, sk (in %ebx)
1388          * might be destroyed here. This current version compiles correctly,
1389          * but you have been warned.
1390          */
1391         return 0;
1392
1393 csum_err:
1394         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1395         goto discard;
1396 }
1397
1398 /*
1399  *      From tcp_input.c
1400  */
1401
1402 int tcp_v4_rcv(struct sk_buff *skb)
1403 {
1404         struct tcphdr *th;
1405         struct sock *sk;
1406         int ret;
1407
1408         if (skb->pkt_type != PACKET_HOST)
1409                 goto discard_it;
1410
1411         /* Count it even if it's bad */
1412         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1413
1414         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1415                 goto discard_it;
1416
1417         th = skb->h.th;
1418
1419         if (th->doff < sizeof(struct tcphdr) / 4)
1420                 goto bad_packet;
1421         if (!pskb_may_pull(skb, th->doff * 4))
1422                 goto discard_it;
1423
1424         /* An explanation is required here, I think.
1425          * Packet length and doff are validated by header prediction,
1426          * provided the case of th->doff==0 is eliminated.
1427          * So, we defer the checks. */
1428         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1429              tcp_v4_checksum_init(skb) < 0))
1430                 goto bad_packet;
1431
1432         th = skb->h.th;
1433         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1434         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1435                                     skb->len - th->doff * 4);
1436         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1437         TCP_SKB_CB(skb)->when    = 0;
1438         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1439         TCP_SKB_CB(skb)->sacked  = 0;
1440
1441         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1442                            skb->nh.iph->daddr, ntohs(th->dest),
1443                            tcp_v4_iif(skb));
1444
1445         if (!sk)
1446                 goto no_tcp_socket;
1447
1448 process:
1449         if (sk->sk_state == TCP_TIME_WAIT)
1450                 goto do_time_wait;
1451
1452         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1453                 goto discard_and_relse;
1454
1455         if (sk_filter(sk, skb, 0))
1456                 goto discard_and_relse;
1457
1458         skb->dev = NULL;
1459
1460         bh_lock_sock(sk);
1461         ret = 0;
1462         if (!sock_owned_by_user(sk)) {
1463                 if (!tcp_prequeue(sk, skb))
1464                         ret = tcp_v4_do_rcv(sk, skb);
1465         } else
1466                 sk_add_backlog(sk, skb);
1467         bh_unlock_sock(sk);
1468
1469         sock_put(sk);
1470
1471         return ret;
1472
1473 no_tcp_socket:
1474         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1475                 goto discard_it;
1476
1477         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1478 bad_packet:
1479                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1480         } else {
1481                 tcp_v4_send_reset(skb);
1482         }
1483
1484 discard_it:
1485         /* Discard frame. */
1486         kfree_skb(skb);
1487         return 0;
1488
1489 discard_and_relse:
1490         sock_put(sk);
1491         goto discard_it;
1492
1493 do_time_wait:
1494         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1495                 inet_twsk_put((struct inet_timewait_sock *) sk);
1496                 goto discard_it;
1497         }
1498
1499         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1500                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1501                 inet_twsk_put((struct inet_timewait_sock *) sk);
1502                 goto discard_it;
1503         }
1504         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1505                                            skb, th)) {
1506         case TCP_TW_SYN: {
1507                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1508                                                         skb->nh.iph->daddr,
1509                                                         ntohs(th->dest),
1510                                                         tcp_v4_iif(skb));
1511                 if (sk2) {
1512                         tcp_tw_deschedule((struct inet_timewait_sock *)sk);
1513                         inet_twsk_put((struct inet_timewait_sock *)sk);
1514                         sk = sk2;
1515                         goto process;
1516                 }
1517                 /* Fall through to ACK */
1518         }
1519         case TCP_TW_ACK:
1520                 tcp_v4_timewait_ack(sk, skb);
1521                 break;
1522         case TCP_TW_RST:
1523                 goto no_tcp_socket;
1524         case TCP_TW_SUCCESS:;
1525         }
1526         goto discard_it;
1527 }
1528
1529 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1530 {
1531         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1532         struct inet_sock *inet = inet_sk(sk);
1533
1534         sin->sin_family         = AF_INET;
1535         sin->sin_addr.s_addr    = inet->daddr;
1536         sin->sin_port           = inet->dport;
1537 }
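
/*
 * Illustrative only: inet->daddr and inet->dport copied above are the
 * peer's address and port, i.e. what a connected IPv4 TCP socket
 * reports to user space.  A user-space sketch using getpeername(2),
 * compiled out because it is not kernel code:
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static void print_peer(int fd)
{
        struct sockaddr_in sin;
        socklen_t len = sizeof(sin);

        if (getpeername(fd, (struct sockaddr *)&sin, &len) == 0)
                printf("peer %s:%u\n", inet_ntoa(sin.sin_addr),
                       (unsigned)ntohs(sin.sin_port));
}
#endif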
1538
1539 /* VJ's idea.  Save the last timestamp seen from this destination and hold
1540  * it for at least the normal timewait interval, so it can be used for
1541  * duplicate segment detection in subsequent connections before they enter
1542  * the synchronized state.  (A wraparound-comparison sketch follows below.)
1543  */
1544
1545 int tcp_v4_remember_stamp(struct sock *sk)
1546 {
1547         struct inet_sock *inet = inet_sk(sk);
1548         struct tcp_sock *tp = tcp_sk(sk);
1549         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1550         struct inet_peer *peer = NULL;
1551         int release_it = 0;
1552
1553         if (!rt || rt->rt_dst != inet->daddr) {
1554                 peer = inet_getpeer(inet->daddr, 1);
1555                 release_it = 1;
1556         } else {
1557                 if (!rt->peer)
1558                         rt_bind_peer(rt, 1);
1559                 peer = rt->peer;
1560         }
1561
1562         if (peer) {
1563                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1564                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1565                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1566                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1567                         peer->tcp_ts = tp->rx_opt.ts_recent;
1568                 }
1569                 if (release_it)
1570                         inet_putpeer(peer);
1571                 return 1;
1572         }
1573
1574         return 0;
1575 }
1576
1577 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1578 {
1579         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1580
1581         if (peer) {
1582                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1583
1584                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1585                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1586                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1587                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1588                         peer->tcp_ts       = tcptw->tw_ts_recent;
1589                 }
1590                 inet_putpeer(peer);
1591                 return 1;
1592         }
1593
1594         return 0;
1595 }
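
/*
 * Illustrative only: the "(s32)(a - b) <= 0" tests in the two
 * remember_stamp functions above are serial-number comparisons that
 * stay correct across 32-bit timestamp wraparound.  A minimal sketch
 * (the example_* name is made up):
 */
static inline int example_ts_not_newer(u32 a, u32 b)
{
        /* True when timestamp a is not newer than b, even if the counter
         * wrapped; e.g. a = 0xfffffffe, b = 1 gives (s32)0xfffffffd = -3,
         * so a is (correctly) treated as the older stamp.
         */
        return (s32)(a - b) <= 0;
}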
1596
1597 struct tcp_func ipv4_specific = {
1598         .queue_xmit     =       ip_queue_xmit,
1599         .send_check     =       tcp_v4_send_check,
1600         .rebuild_header =       inet_sk_rebuild_header,
1601         .conn_request   =       tcp_v4_conn_request,
1602         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1603         .remember_stamp =       tcp_v4_remember_stamp,
1604         .net_header_len =       sizeof(struct iphdr),
1605         .setsockopt     =       ip_setsockopt,
1606         .getsockopt     =       ip_getsockopt,
1607         .addr2sockaddr  =       v4_addr2sockaddr,
1608         .sockaddr_len   =       sizeof(struct sockaddr_in),
1609 };
1610
1611 /* NOTE: A lot of fields are set to zero explicitly by the call to
1612  *       sk_alloc(), so they need not be initialized here.
1613  */
1614 static int tcp_v4_init_sock(struct sock *sk)
1615 {
1616         struct tcp_sock *tp = tcp_sk(sk);
1617
1618         skb_queue_head_init(&tp->out_of_order_queue);
1619         tcp_init_xmit_timers(sk);
1620         tcp_prequeue_init(tp);
1621
1622         tp->rto  = TCP_TIMEOUT_INIT;
1623         tp->mdev = TCP_TIMEOUT_INIT;
1624
1625         /* So many TCP implementations out there (incorrectly) count the
1626          * initial SYN frame in their delayed-ACK and congestion control
1627          * algorithms that we must have the following bandaid to talk
1628          * efficiently to them.  -DaveM
1629          */
1630         tp->snd_cwnd = 2;
1631
1632         /* See draft-stevens-tcpca-spec-01 for discussion of the
1633          * initialization of these values.
1634          */
1635         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1636         tp->snd_cwnd_clamp = ~0;
1637         tp->mss_cache = 536;
1638
1639         tp->reordering = sysctl_tcp_reordering;
1640         tp->ca_ops = &tcp_init_congestion_ops;
1641
1642         sk->sk_state = TCP_CLOSE;
1643
1644         sk->sk_write_space = sk_stream_write_space;
1645         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1646
1647         tp->af_specific = &ipv4_specific;
1648
1649         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1650         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1651
1652         atomic_inc(&tcp_sockets_allocated);
1653
1654         return 0;
1655 }
1656
1657 int tcp_v4_destroy_sock(struct sock *sk)
1658 {
1659         struct tcp_sock *tp = tcp_sk(sk);
1660
1661         tcp_clear_xmit_timers(sk);
1662
1663         tcp_cleanup_congestion_control(tp);
1664
1665         /* Clean up the write buffer. */
1666         sk_stream_writequeue_purge(sk);
1667
1668         /* Cleans up our, hopefully empty, out_of_order_queue. */
1669         __skb_queue_purge(&tp->out_of_order_queue);
1670
1671         /* Clean up the prequeue; it really should be empty. */
1672         __skb_queue_purge(&tp->ucopy.prequeue);
1673
1674         /* Clean up a referenced TCP bind bucket. */
1675         if (inet_sk(sk)->bind_hash)
1676                 inet_put_port(&tcp_hashinfo, sk);
1677
1678         /*
1679          * If a sendmsg cached page exists, toss it.
1680          */
1681         if (sk->sk_sndmsg_page) {
1682                 __free_page(sk->sk_sndmsg_page);
1683                 sk->sk_sndmsg_page = NULL;
1684         }
1685
1686         atomic_dec(&tcp_sockets_allocated);
1687
1688         return 0;
1689 }
1690
1691 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1692
1693 #ifdef CONFIG_PROC_FS
1694 /* Proc filesystem TCP sock list dumping. */
1695
1696 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1697 {
1698         return hlist_empty(head) ? NULL :
1699                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1700 }
1701
1702 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1703 {
1704         return tw->tw_node.next ?
1705                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1706 }
1707
1708 static void *listening_get_next(struct seq_file *seq, void *cur)
1709 {
1710         struct tcp_sock *tp;
1711         struct hlist_node *node;
1712         struct sock *sk = cur;
1713         struct tcp_iter_state* st = seq->private;
1714
1715         if (!sk) {
1716                 st->bucket = 0;
1717                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1718                 goto get_sk;
1719         }
1720
1721         ++st->num;
1722
1723         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1724                 struct request_sock *req = cur;
1725
1726                 tp = tcp_sk(st->syn_wait_sk);
1727                 req = req->dl_next;
1728                 while (1) {
1729                         while (req) {
1730                                 if (req->rsk_ops->family == st->family) {
1731                                         cur = req;
1732                                         goto out;
1733                                 }
1734                                 req = req->dl_next;
1735                         }
1736                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1737                                 break;
1738 get_req:
1739                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1740                 }
1741                 sk        = sk_next(st->syn_wait_sk);
1742                 st->state = TCP_SEQ_STATE_LISTENING;
1743                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1744         } else {
1745                 tp = tcp_sk(sk);
1746                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1747                 if (reqsk_queue_len(&tp->accept_queue))
1748                         goto start_req;
1749                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1750                 sk = sk_next(sk);
1751         }
1752 get_sk:
1753         sk_for_each_from(sk, node) {
1754                 if (sk->sk_family == st->family) {
1755                         cur = sk;
1756                         goto out;
1757                 }
1758                 tp = tcp_sk(sk);
1759                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1760                 if (reqsk_queue_len(&tp->accept_queue)) {
1761 start_req:
1762                         st->uid         = sock_i_uid(sk);
1763                         st->syn_wait_sk = sk;
1764                         st->state       = TCP_SEQ_STATE_OPENREQ;
1765                         st->sbucket     = 0;
1766                         goto get_req;
1767                 }
1768                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1769         }
1770         if (++st->bucket < INET_LHTABLE_SIZE) {
1771                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1772                 goto get_sk;
1773         }
1774         cur = NULL;
1775 out:
1776         return cur;
1777 }
1778
1779 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1780 {
1781         void *rc = listening_get_next(seq, NULL);
1782
1783         while (rc && *pos) {
1784                 rc = listening_get_next(seq, rc);
1785                 --*pos;
1786         }
1787         return rc;
1788 }
1789
1790 static void *established_get_first(struct seq_file *seq)
1791 {
1792         struct tcp_iter_state* st = seq->private;
1793         void *rc = NULL;
1794
1795         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1796                 struct sock *sk;
1797                 struct hlist_node *node;
1798                 struct inet_timewait_sock *tw;
1799
1800                 /* We can reschedule _before_ having picked the target: */
1801                 cond_resched_softirq();
1802
1803                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1804                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1805                         if (sk->sk_family != st->family) {
1806                                 continue;
1807                         }
1808                         rc = sk;
1809                         goto out;
1810                 }
1811                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1812                 inet_twsk_for_each(tw, node,
1813                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1814                         if (tw->tw_family != st->family) {
1815                                 continue;
1816                         }
1817                         rc = tw;
1818                         goto out;
1819                 }
1820                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1821                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1822         }
1823 out:
1824         return rc;
1825 }
1826
1827 static void *established_get_next(struct seq_file *seq, void *cur)
1828 {
1829         struct sock *sk = cur;
1830         struct inet_timewait_sock *tw;
1831         struct hlist_node *node;
1832         struct tcp_iter_state* st = seq->private;
1833
1834         ++st->num;
1835
1836         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1837                 tw = cur;
1838                 tw = tw_next(tw);
1839 get_tw:
1840                 while (tw && tw->tw_family != st->family) {
1841                         tw = tw_next(tw);
1842                 }
1843                 if (tw) {
1844                         cur = tw;
1845                         goto out;
1846                 }
1847                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1848                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1849
1850                 /* We can reschedule between buckets: */
1851                 cond_resched_softirq();
1852
1853                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1854                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1855                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1856                 } else {
1857                         cur = NULL;
1858                         goto out;
1859                 }
1860         } else
1861                 sk = sk_next(sk);
1862
1863         sk_for_each_from(sk, node) {
1864                 if (sk->sk_family == st->family)
1865                         goto found;
1866         }
1867
1868         st->state = TCP_SEQ_STATE_TIME_WAIT;
1869         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1870         goto get_tw;
1871 found:
1872         cur = sk;
1873 out:
1874         return cur;
1875 }
1876
1877 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1878 {
1879         void *rc = established_get_first(seq);
1880
1881         while (rc && pos) {
1882                 rc = established_get_next(seq, rc);
1883                 --pos;
1884         }               
1885         return rc;
1886 }
1887
1888 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1889 {
1890         void *rc;
1891         struct tcp_iter_state* st = seq->private;
1892
1893         inet_listen_lock(&tcp_hashinfo);
1894         st->state = TCP_SEQ_STATE_LISTENING;
1895         rc        = listening_get_idx(seq, &pos);
1896
1897         if (!rc) {
1898                 inet_listen_unlock(&tcp_hashinfo);
1899                 local_bh_disable();
1900                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1901                 rc        = established_get_idx(seq, pos);
1902         }
1903
1904         return rc;
1905 }
1906
1907 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1908 {
1909         struct tcp_iter_state* st = seq->private;
1910         st->state = TCP_SEQ_STATE_LISTENING;
1911         st->num = 0;
1912         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1913 }
1914
1915 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1916 {
1917         void *rc = NULL;
1918         struct tcp_iter_state* st;
1919
1920         if (v == SEQ_START_TOKEN) {
1921                 rc = tcp_get_idx(seq, 0);
1922                 goto out;
1923         }
1924         st = seq->private;
1925
1926         switch (st->state) {
1927         case TCP_SEQ_STATE_OPENREQ:
1928         case TCP_SEQ_STATE_LISTENING:
1929                 rc = listening_get_next(seq, v);
1930                 if (!rc) {
1931                         inet_listen_unlock(&tcp_hashinfo);
1932                         local_bh_disable();
1933                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1934                         rc        = established_get_first(seq);
1935                 }
1936                 break;
1937         case TCP_SEQ_STATE_ESTABLISHED:
1938         case TCP_SEQ_STATE_TIME_WAIT:
1939                 rc = established_get_next(seq, v);
1940                 break;
1941         }
1942 out:
1943         ++*pos;
1944         return rc;
1945 }
1946
1947 static void tcp_seq_stop(struct seq_file *seq, void *v)
1948 {
1949         struct tcp_iter_state* st = seq->private;
1950
1951         switch (st->state) {
1952         case TCP_SEQ_STATE_OPENREQ:
1953                 if (v) {
1954                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
1955                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1956                 }
1957         case TCP_SEQ_STATE_LISTENING:
1958                 if (v != SEQ_START_TOKEN)
1959                         inet_listen_unlock(&tcp_hashinfo);
1960                 break;
1961         case TCP_SEQ_STATE_TIME_WAIT:
1962         case TCP_SEQ_STATE_ESTABLISHED:
1963                 if (v)
1964                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1965                 local_bh_enable();
1966                 break;
1967         }
1968 }
1969
1970 static int tcp_seq_open(struct inode *inode, struct file *file)
1971 {
1972         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1973         struct seq_file *seq;
1974         struct tcp_iter_state *s;
1975         int rc;
1976
1977         if (unlikely(afinfo == NULL))
1978                 return -EINVAL;
1979
1980         s = kmalloc(sizeof(*s), GFP_KERNEL);
1981         if (!s)
1982                 return -ENOMEM;
1983         memset(s, 0, sizeof(*s));
1984         s->family               = afinfo->family;
1985         s->seq_ops.start        = tcp_seq_start;
1986         s->seq_ops.next         = tcp_seq_next;
1987         s->seq_ops.show         = afinfo->seq_show;
1988         s->seq_ops.stop         = tcp_seq_stop;
1989
1990         rc = seq_open(file, &s->seq_ops);
1991         if (rc)
1992                 goto out_kfree;
1993         seq          = file->private_data;
1994         seq->private = s;
1995 out:
1996         return rc;
1997 out_kfree:
1998         kfree(s);
1999         goto out;
2000 }
2001
2002 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2003 {
2004         int rc = 0;
2005         struct proc_dir_entry *p;
2006
2007         if (!afinfo)
2008                 return -EINVAL;
2009         afinfo->seq_fops->owner         = afinfo->owner;
2010         afinfo->seq_fops->open          = tcp_seq_open;
2011         afinfo->seq_fops->read          = seq_read;
2012         afinfo->seq_fops->llseek        = seq_lseek;
2013         afinfo->seq_fops->release       = seq_release_private;
2014         
2015         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2016         if (p)
2017                 p->data = afinfo;
2018         else
2019                 rc = -ENOMEM;
2020         return rc;
2021 }
2022
2023 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2024 {
2025         if (!afinfo)
2026                 return;
2027         proc_net_remove(afinfo->name);
2028         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2029 }
2030
2031 static void get_openreq4(struct sock *sk, struct request_sock *req,
2032                          char *tmpbuf, int i, int uid)
2033 {
2034         const struct inet_request_sock *ireq = inet_rsk(req);
2035         int ttd = req->expires - jiffies;
2036
2037         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2038                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2039                 i,
2040                 ireq->loc_addr,
2041                 ntohs(inet_sk(sk)->sport),
2042                 ireq->rmt_addr,
2043                 ntohs(ireq->rmt_port),
2044                 TCP_SYN_RECV,
2045                 0, 0, /* could print option size, but that is af dependent. */
2046                 1,    /* timers active (only the expire timer) */
2047                 jiffies_to_clock_t(ttd),
2048                 req->retrans,
2049                 uid,
2050                 0,  /* non standard timer */
2051                 0, /* open_requests have no inode */
2052                 atomic_read(&sk->sk_refcnt),
2053                 req);
2054 }
2055
2056 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2057 {
2058         int timer_active;
2059         unsigned long timer_expires;
2060         struct tcp_sock *tp = tcp_sk(sp);
2061         struct inet_sock *inet = inet_sk(sp);
2062         unsigned int dest = inet->daddr;
2063         unsigned int src = inet->rcv_saddr;
2064         __u16 destp = ntohs(inet->dport);
2065         __u16 srcp = ntohs(inet->sport);
2066
2067         if (tp->pending == TCP_TIME_RETRANS) {
2068                 timer_active    = 1;
2069                 timer_expires   = tp->timeout;
2070         } else if (tp->pending == TCP_TIME_PROBE0) {
2071                 timer_active    = 4;
2072                 timer_expires   = tp->timeout;
2073         } else if (timer_pending(&sp->sk_timer)) {
2074                 timer_active    = 2;
2075                 timer_expires   = sp->sk_timer.expires;
2076         } else {
2077                 timer_active    = 0;
2078                 timer_expires = jiffies;
2079         }
2080
2081         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2082                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2083                 i, src, srcp, dest, destp, sp->sk_state,
2084                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2085                 timer_active,
2086                 jiffies_to_clock_t(timer_expires - jiffies),
2087                 tp->retransmits,
2088                 sock_i_uid(sp),
2089                 tp->probes_out,
2090                 sock_i_ino(sp),
2091                 atomic_read(&sp->sk_refcnt), sp,
2092                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2093                 tp->snd_cwnd,
2094                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2095 }
2096
2097 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
2098 {
2099         unsigned int dest, src;
2100         __u16 destp, srcp;
2101         int ttd = tw->tw_ttd - jiffies;
2102
2103         if (ttd < 0)
2104                 ttd = 0;
2105
2106         dest  = tw->tw_daddr;
2107         src   = tw->tw_rcv_saddr;
2108         destp = ntohs(tw->tw_dport);
2109         srcp  = ntohs(tw->tw_sport);
2110
2111         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2112                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2113                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2114                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2115                 atomic_read(&tw->tw_refcnt), tw);
2116 }
2117
2118 #define TMPSZ 150
2119
2120 static int tcp4_seq_show(struct seq_file *seq, void *v)
2121 {
2122         struct tcp_iter_state* st;
2123         char tmpbuf[TMPSZ + 1];
2124
2125         if (v == SEQ_START_TOKEN) {
2126                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2127                            "  sl  local_address rem_address   st tx_queue "
2128                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2129                            "inode");
2130                 goto out;
2131         }
2132         st = seq->private;
2133
2134         switch (st->state) {
2135         case TCP_SEQ_STATE_LISTENING:
2136         case TCP_SEQ_STATE_ESTABLISHED:
2137                 get_tcp4_sock(v, tmpbuf, st->num);
2138                 break;
2139         case TCP_SEQ_STATE_OPENREQ:
2140                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2141                 break;
2142         case TCP_SEQ_STATE_TIME_WAIT:
2143                 get_timewait4_sock(v, tmpbuf, st->num);
2144                 break;
2145         }
2146         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2147 out:
2148         return 0;
2149 }
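
/*
 * Illustrative only: the header and rows formatted above are what user
 * space reads from /proc/net/tcp.  Addresses and ports are hexadecimal;
 * the address is the raw in-memory (network-order) word printed with
 * %08X, so its octets appear reversed on little-endian hosts.  A
 * user-space sketch that parses the first few columns (compiled out;
 * the octet unpacking assumes a little-endian host):
 */
#if 0
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                unsigned int laddr, lport, raddr, rport, state;

                if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
                           &laddr, &lport, &raddr, &rport, &state) != 5)
                        continue;       /* header line does not match */
                printf("local %u.%u.%u.%u:%u  state %#x\n",
                       laddr & 0xff, (laddr >> 8) & 0xff,
                       (laddr >> 16) & 0xff, (laddr >> 24) & 0xff,
                       lport, state);
        }
        fclose(f);
        return 0;
}
#endif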
2150
2151 static struct file_operations tcp4_seq_fops;
2152 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2153         .owner          = THIS_MODULE,
2154         .name           = "tcp",
2155         .family         = AF_INET,
2156         .seq_show       = tcp4_seq_show,
2157         .seq_fops       = &tcp4_seq_fops,
2158 };
2159
2160 int __init tcp4_proc_init(void)
2161 {
2162         return tcp_proc_register(&tcp4_seq_afinfo);
2163 }
2164
2165 void tcp4_proc_exit(void)
2166 {
2167         tcp_proc_unregister(&tcp4_seq_afinfo);
2168 }
2169 #endif /* CONFIG_PROC_FS */
2170
2171 struct proto tcp_prot = {
2172         .name                   = "TCP",
2173         .owner                  = THIS_MODULE,
2174         .close                  = tcp_close,
2175         .connect                = tcp_v4_connect,
2176         .disconnect             = tcp_disconnect,
2177         .accept                 = tcp_accept,
2178         .ioctl                  = tcp_ioctl,
2179         .init                   = tcp_v4_init_sock,
2180         .destroy                = tcp_v4_destroy_sock,
2181         .shutdown               = tcp_shutdown,
2182         .setsockopt             = tcp_setsockopt,
2183         .getsockopt             = tcp_getsockopt,
2184         .sendmsg                = tcp_sendmsg,
2185         .recvmsg                = tcp_recvmsg,
2186         .backlog_rcv            = tcp_v4_do_rcv,
2187         .hash                   = tcp_v4_hash,
2188         .unhash                 = tcp_unhash,
2189         .get_port               = tcp_v4_get_port,
2190         .enter_memory_pressure  = tcp_enter_memory_pressure,
2191         .sockets_allocated      = &tcp_sockets_allocated,
2192         .memory_allocated       = &tcp_memory_allocated,
2193         .memory_pressure        = &tcp_memory_pressure,
2194         .sysctl_mem             = sysctl_tcp_mem,
2195         .sysctl_wmem            = sysctl_tcp_wmem,
2196         .sysctl_rmem            = sysctl_tcp_rmem,
2197         .max_header             = MAX_TCP_HEADER,
2198         .obj_size               = sizeof(struct tcp_sock),
2199         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
2200         .rsk_prot               = &tcp_request_sock_ops,
2201 };
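
/*
 * Illustrative only: a rough sketch of how the generic inet socket layer
 * reaches the handlers in tcp_prot.  For a SOCK_STREAM/IPPROTO_TCP socket
 * sk->sk_prot points at tcp_prot, so an operation such as connect()
 * dispatches as below (the real call chain lives in net/ipv4/af_inet.c;
 * the example_* name is made up):
 */
static inline int example_proto_connect(struct sock *sk,
                                        struct sockaddr *uaddr, int addr_len)
{
        /* for an AF_INET TCP socket this ends up in tcp_v4_connect() */
        return sk->sk_prot->connect(sk, uaddr, addr_len);
}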
2202
2203
2204
2205 void __init tcp_v4_init(struct net_proto_family *ops)
2206 {
2207         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2208         if (err < 0)
2209                 panic("Failed to create the TCP control socket.\n");
2210         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2211         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2212
2213         /* Unhash it so that IP input processing does not even
2214          * see it; we do not wish this socket to see incoming
2215          * packets.
2216          */
2217         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2218 }
2219
2220 EXPORT_SYMBOL(ipv4_specific);
2221 EXPORT_SYMBOL(inet_bind_bucket_create);
2222 EXPORT_SYMBOL(tcp_hashinfo);
2223 EXPORT_SYMBOL(tcp_prot);
2224 EXPORT_SYMBOL(tcp_unhash);
2225 EXPORT_SYMBOL(tcp_v4_conn_request);
2226 EXPORT_SYMBOL(tcp_v4_connect);
2227 EXPORT_SYMBOL(tcp_v4_do_rcv);
2228 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2229 EXPORT_SYMBOL(tcp_v4_send_check);
2230 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2231
2232 #ifdef CONFIG_PROC_FS
2233 EXPORT_SYMBOL(tcp_proc_register);
2234 EXPORT_SYMBOL(tcp_proc_unregister);
2235 #endif
2236 EXPORT_SYMBOL(sysctl_local_port_range);
2237 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2238 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2239