Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87
88 /* Check TCP sequence numbers in ICMP packets. */
89 #define ICMP_MIN_LENGTH 8
90
91 /* Socket used for sending RSTs */
92 static struct socket *tcp_socket __read_mostly;
93
94 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
95
96 #ifdef CONFIG_TCP_MD5SIG
97 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
98                                                    __be32 addr);
99 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
100                                    __be32 saddr, __be32 daddr,
101                                    struct tcphdr *th, int protocol,
102                                    int tcplen);
103 #endif
104
105 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
106         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
107         .lhash_users = ATOMIC_INIT(0),
108         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
109 };
110
111 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
112 {
113         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
114                                  inet_csk_bind_conflict);
115 }
116
117 static void tcp_v4_hash(struct sock *sk)
118 {
119         inet_hash(&tcp_hashinfo, sk);
120 }
121
122 void tcp_unhash(struct sock *sk)
123 {
124         inet_unhash(&tcp_hashinfo, sk);
125 }
126
127 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
128 {
129         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
130                                           ip_hdr(skb)->saddr,
131                                           tcp_hdr(skb)->dest,
132                                           tcp_hdr(skb)->source);
133 }
134
135 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
136 {
137         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
138         struct tcp_sock *tp = tcp_sk(sk);
139
140         /* With PAWS, it is safe from the viewpoint
141            of data integrity. Even without PAWS it is safe provided sequence
142            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143
144            Actually, the idea is close to VJ's one, only timestamp cache is
145            held not per host, but per port pair and TW bucket is used as state
146            holder.
147
148            If TW bucket has been already destroyed we fall back to VJ's scheme
149            and use initial timestamp retrieved from peer table.
150          */
151         if (tcptw->tw_ts_recent_stamp &&
152             (twp == NULL || (sysctl_tcp_tw_reuse &&
153                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
154                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
155                 if (tp->write_seq == 0)
156                         tp->write_seq = 1;
157                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
158                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
159                 sock_hold(sktw);
160                 return 1;
161         }
162
163         return 0;
164 }
165
166 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
167
168 /* This will initiate an outgoing connection. */
169 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
170 {
171         struct inet_sock *inet = inet_sk(sk);
172         struct tcp_sock *tp = tcp_sk(sk);
173         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
174         struct rtable *rt;
175         __be32 daddr, nexthop;
176         int tmp;
177         int err;
178
179         if (addr_len < sizeof(struct sockaddr_in))
180                 return -EINVAL;
181
182         if (usin->sin_family != AF_INET)
183                 return -EAFNOSUPPORT;
184
185         nexthop = daddr = usin->sin_addr.s_addr;
186         if (inet->opt && inet->opt->srr) {
187                 if (!daddr)
188                         return -EINVAL;
189                 nexthop = inet->opt->faddr;
190         }
191
192         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
193                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
194                                IPPROTO_TCP,
195                                inet->sport, usin->sin_port, sk, 1);
196         if (tmp < 0) {
197                 if (tmp == -ENETUNREACH)
198                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
199                 return tmp;
200         }
201
202         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
203                 ip_rt_put(rt);
204                 return -ENETUNREACH;
205         }
206
207         if (!inet->opt || !inet->opt->srr)
208                 daddr = rt->rt_dst;
209
210         if (!inet->saddr)
211                 inet->saddr = rt->rt_src;
212         inet->rcv_saddr = inet->saddr;
213
214         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
215                 /* Reset inherited state */
216                 tp->rx_opt.ts_recent       = 0;
217                 tp->rx_opt.ts_recent_stamp = 0;
218                 tp->write_seq              = 0;
219         }
220
221         if (tcp_death_row.sysctl_tw_recycle &&
222             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
223                 struct inet_peer *peer = rt_get_peer(rt);
224                 /*
225                  * VJ's idea. We save last timestamp seen from
226                  * the destination in peer table, when entering state
227                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
228                  * when trying new connection.
229                  */
230                 if (peer != NULL &&
231                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
232                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
233                         tp->rx_opt.ts_recent = peer->tcp_ts;
234                 }
235         }
236
237         inet->dport = usin->sin_port;
238         inet->daddr = daddr;
239
240         inet_csk(sk)->icsk_ext_hdr_len = 0;
241         if (inet->opt)
242                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
243
244         tp->rx_opt.mss_clamp = 536;
245
246         /* Socket identity is still unknown (sport may be zero).
247          * However we set state to SYN-SENT and not releasing socket
248          * lock select source port, enter ourselves into the hash tables and
249          * complete initialization after this.
250          */
251         tcp_set_state(sk, TCP_SYN_SENT);
252         err = inet_hash_connect(&tcp_death_row, sk);
253         if (err)
254                 goto failure;
255
256         err = ip_route_newports(&rt, IPPROTO_TCP,
257                                 inet->sport, inet->dport, sk);
258         if (err)
259                 goto failure;
260
261         /* OK, now commit destination to socket.  */
262         sk->sk_gso_type = SKB_GSO_TCPV4;
263         sk_setup_caps(sk, &rt->u.dst);
264
265         if (!tp->write_seq)
266                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
267                                                            inet->daddr,
268                                                            inet->sport,
269                                                            usin->sin_port);
270
271         inet->id = tp->write_seq ^ jiffies;
272
273         err = tcp_connect(sk);
274         rt = NULL;
275         if (err)
276                 goto failure;
277
278         return 0;
279
280 failure:
281         /*
282          * This unhashes the socket and releases the local port,
283          * if necessary.
284          */
285         tcp_set_state(sk, TCP_CLOSE);
286         ip_rt_put(rt);
287         sk->sk_route_caps = 0;
288         inet->dport = 0;
289         return err;
290 }
291
292 /*
293  * This routine does path mtu discovery as defined in RFC1191.
294  */
295 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
296 {
297         struct dst_entry *dst;
298         struct inet_sock *inet = inet_sk(sk);
299
300         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
301          * send out by Linux are always <576bytes so they should go through
302          * unfragmented).
303          */
304         if (sk->sk_state == TCP_LISTEN)
305                 return;
306
307         /* We don't check in the destentry if pmtu discovery is forbidden
308          * on this route. We just assume that no packet_to_big packets
309          * are send back when pmtu discovery is not active.
310          * There is a small race when the user changes this flag in the
311          * route, but I think that's acceptable.
312          */
313         if ((dst = __sk_dst_check(sk, 0)) == NULL)
314                 return;
315
316         dst->ops->update_pmtu(dst, mtu);
317
318         /* Something is about to be wrong... Remember soft error
319          * for the case, if this connection will not able to recover.
320          */
321         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
322                 sk->sk_err_soft = EMSGSIZE;
323
324         mtu = dst_mtu(dst);
325
326         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
327             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
328                 tcp_sync_mss(sk, mtu);
329
330                 /* Resend the TCP packet because it's
331                  * clear that the old packet has been
332                  * dropped. This is the new "fast" path mtu
333                  * discovery.
334                  */
335                 tcp_simple_retransmit(sk);
336         } /* else let the usual retransmit timer handle it */
337 }
338
339 /*
340  * This routine is called by the ICMP module when it gets some
341  * sort of error condition.  If err < 0 then the socket should
342  * be closed and the error returned to the user.  If err > 0
343  * it's just the icmp type << 8 | icmp code.  After adjustment
344  * header points to the first 8 bytes of the tcp header.  We need
345  * to find the appropriate port.
346  *
347  * The locking strategy used here is very "optimistic". When
348  * someone else accesses the socket the ICMP is just dropped
349  * and for some paths there is no check at all.
350  * A more general error queue to queue errors for later handling
351  * is probably better.
352  *
353  */
354
355 void tcp_v4_err(struct sk_buff *skb, u32 info)
356 {
357         struct iphdr *iph = (struct iphdr *)skb->data;
358         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
359         struct tcp_sock *tp;
360         struct inet_sock *inet;
361         const int type = icmp_hdr(skb)->type;
362         const int code = icmp_hdr(skb)->code;
363         struct sock *sk;
364         __u32 seq;
365         int err;
366
367         if (skb->len < (iph->ihl << 2) + 8) {
368                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
369                 return;
370         }
371
372         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
373                          th->source, inet_iif(skb));
374         if (!sk) {
375                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
376                 return;
377         }
378         if (sk->sk_state == TCP_TIME_WAIT) {
379                 inet_twsk_put(inet_twsk(sk));
380                 return;
381         }
382
383         bh_lock_sock(sk);
384         /* If too many ICMPs get dropped on busy
385          * servers this needs to be solved differently.
386          */
387         if (sock_owned_by_user(sk))
388                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
389
390         if (sk->sk_state == TCP_CLOSE)
391                 goto out;
392
393         tp = tcp_sk(sk);
394         seq = ntohl(th->seq);
395         if (sk->sk_state != TCP_LISTEN &&
396             !between(seq, tp->snd_una, tp->snd_nxt)) {
397                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
398                 goto out;
399         }
400
401         switch (type) {
402         case ICMP_SOURCE_QUENCH:
403                 /* Just silently ignore these. */
404                 goto out;
405         case ICMP_PARAMETERPROB:
406                 err = EPROTO;
407                 break;
408         case ICMP_DEST_UNREACH:
409                 if (code > NR_ICMP_UNREACH)
410                         goto out;
411
412                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
413                         if (!sock_owned_by_user(sk))
414                                 do_pmtu_discovery(sk, iph, info);
415                         goto out;
416                 }
417
418                 err = icmp_err_convert[code].errno;
419                 break;
420         case ICMP_TIME_EXCEEDED:
421                 err = EHOSTUNREACH;
422                 break;
423         default:
424                 goto out;
425         }
426
427         switch (sk->sk_state) {
428                 struct request_sock *req, **prev;
429         case TCP_LISTEN:
430                 if (sock_owned_by_user(sk))
431                         goto out;
432
433                 req = inet_csk_search_req(sk, &prev, th->dest,
434                                           iph->daddr, iph->saddr);
435                 if (!req)
436                         goto out;
437
438                 /* ICMPs are not backlogged, hence we cannot get
439                    an established socket here.
440                  */
441                 BUG_TRAP(!req->sk);
442
443                 if (seq != tcp_rsk(req)->snt_isn) {
444                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
445                         goto out;
446                 }
447
448                 /*
449                  * Still in SYN_RECV, just remove it silently.
450                  * There is no good way to pass the error to the newly
451                  * created socket, and POSIX does not want network
452                  * errors returned from accept().
453                  */
454                 inet_csk_reqsk_queue_drop(sk, req, prev);
455                 goto out;
456
457         case TCP_SYN_SENT:
458         case TCP_SYN_RECV:  /* Cannot happen.
459                                It can f.e. if SYNs crossed.
460                              */
461                 if (!sock_owned_by_user(sk)) {
462                         sk->sk_err = err;
463
464                         sk->sk_error_report(sk);
465
466                         tcp_done(sk);
467                 } else {
468                         sk->sk_err_soft = err;
469                 }
470                 goto out;
471         }
472
473         /* If we've already connected we will keep trying
474          * until we time out, or the user gives up.
475          *
476          * rfc1122 4.2.3.9 allows to consider as hard errors
477          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
478          * but it is obsoleted by pmtu discovery).
479          *
480          * Note, that in modern internet, where routing is unreliable
481          * and in each dark corner broken firewalls sit, sending random
482          * errors ordered by their masters even this two messages finally lose
483          * their original sense (even Linux sends invalid PORT_UNREACHs)
484          *
485          * Now we are in compliance with RFCs.
486          *                                                      --ANK (980905)
487          */
488
489         inet = inet_sk(sk);
490         if (!sock_owned_by_user(sk) && inet->recverr) {
491                 sk->sk_err = err;
492                 sk->sk_error_report(sk);
493         } else  { /* Only an error on timeout */
494                 sk->sk_err_soft = err;
495         }
496
497 out:
498         bh_unlock_sock(sk);
499         sock_put(sk);
500 }
501
502 /* This routine computes an IPv4 TCP checksum. */
503 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
504 {
505         struct inet_sock *inet = inet_sk(sk);
506         struct tcphdr *th = tcp_hdr(skb);
507
508         if (skb->ip_summed == CHECKSUM_PARTIAL) {
509                 th->check = ~tcp_v4_check(len, inet->saddr,
510                                           inet->daddr, 0);
511                 skb->csum_start = skb_transport_header(skb) - skb->head;
512                 skb->csum_offset = offsetof(struct tcphdr, check);
513         } else {
514                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
515                                          csum_partial((char *)th,
516                                                       th->doff << 2,
517                                                       skb->csum));
518         }
519 }
520
521 int tcp_v4_gso_send_check(struct sk_buff *skb)
522 {
523         const struct iphdr *iph;
524         struct tcphdr *th;
525
526         if (!pskb_may_pull(skb, sizeof(*th)))
527                 return -EINVAL;
528
529         iph = ip_hdr(skb);
530         th = tcp_hdr(skb);
531
532         th->check = 0;
533         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
534         skb->csum_start = skb_transport_header(skb) - skb->head;
535         skb->csum_offset = offsetof(struct tcphdr, check);
536         skb->ip_summed = CHECKSUM_PARTIAL;
537         return 0;
538 }
539
540 /*
541  *      This routine will send an RST to the other tcp.
542  *
543  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
544  *                    for reset.
545  *      Answer: if a packet caused RST, it is not for a socket
546  *              existing in our system, if it is matched to a socket,
547  *              it is just duplicate segment or bug in other side's TCP.
548  *              So that we build reply only basing on parameters
549  *              arrived with segment.
550  *      Exception: precedence violation. We do not implement it in any case.
551  */
552
553 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
554 {
555         struct tcphdr *th = tcp_hdr(skb);
556         struct {
557                 struct tcphdr th;
558 #ifdef CONFIG_TCP_MD5SIG
559                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
560 #endif
561         } rep;
562         struct ip_reply_arg arg;
563 #ifdef CONFIG_TCP_MD5SIG
564         struct tcp_md5sig_key *key;
565 #endif
566
567         /* Never send a reset in response to a reset. */
568         if (th->rst)
569                 return;
570
571         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
572                 return;
573
574         /* Swap the send and the receive. */
575         memset(&rep, 0, sizeof(rep));
576         rep.th.dest   = th->source;
577         rep.th.source = th->dest;
578         rep.th.doff   = sizeof(struct tcphdr) / 4;
579         rep.th.rst    = 1;
580
581         if (th->ack) {
582                 rep.th.seq = th->ack_seq;
583         } else {
584                 rep.th.ack = 1;
585                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
586                                        skb->len - (th->doff << 2));
587         }
588
589         memset(&arg, 0, sizeof(arg));
590         arg.iov[0].iov_base = (unsigned char *)&rep;
591         arg.iov[0].iov_len  = sizeof(rep.th);
592
593 #ifdef CONFIG_TCP_MD5SIG
594         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
595         if (key) {
596                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
597                                    (TCPOPT_NOP << 16) |
598                                    (TCPOPT_MD5SIG << 8) |
599                                    TCPOLEN_MD5SIG);
600                 /* Update length and the length the header thinks exists */
601                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
602                 rep.th.doff = arg.iov[0].iov_len / 4;
603
604                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
605                                         key,
606                                         ip_hdr(skb)->daddr,
607                                         ip_hdr(skb)->saddr,
608                                         &rep.th, IPPROTO_TCP,
609                                         arg.iov[0].iov_len);
610         }
611 #endif
612         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
613                                       ip_hdr(skb)->saddr, /* XXX */
614                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
615         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
616
617         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
618
619         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
620         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
621 }
622
623 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
624    outside socket context is ugly, certainly. What can I do?
625  */
626
627 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
628                             struct sk_buff *skb, u32 seq, u32 ack,
629                             u32 win, u32 ts)
630 {
631         struct tcphdr *th = tcp_hdr(skb);
632         struct {
633                 struct tcphdr th;
634                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
635 #ifdef CONFIG_TCP_MD5SIG
636                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
637 #endif
638                         ];
639         } rep;
640         struct ip_reply_arg arg;
641 #ifdef CONFIG_TCP_MD5SIG
642         struct tcp_md5sig_key *key;
643         struct tcp_md5sig_key tw_key;
644 #endif
645
646         memset(&rep.th, 0, sizeof(struct tcphdr));
647         memset(&arg, 0, sizeof(arg));
648
649         arg.iov[0].iov_base = (unsigned char *)&rep;
650         arg.iov[0].iov_len  = sizeof(rep.th);
651         if (ts) {
652                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
653                                    (TCPOPT_TIMESTAMP << 8) |
654                                    TCPOLEN_TIMESTAMP);
655                 rep.opt[1] = htonl(tcp_time_stamp);
656                 rep.opt[2] = htonl(ts);
657                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
658         }
659
660         /* Swap the send and the receive. */
661         rep.th.dest    = th->source;
662         rep.th.source  = th->dest;
663         rep.th.doff    = arg.iov[0].iov_len / 4;
664         rep.th.seq     = htonl(seq);
665         rep.th.ack_seq = htonl(ack);
666         rep.th.ack     = 1;
667         rep.th.window  = htons(win);
668
669 #ifdef CONFIG_TCP_MD5SIG
670         /*
671          * The SKB holds an imcoming packet, but may not have a valid ->sk
672          * pointer. This is especially the case when we're dealing with a
673          * TIME_WAIT ack, because the sk structure is long gone, and only
674          * the tcp_timewait_sock remains. So the md5 key is stashed in that
675          * structure, and we use it in preference.  I believe that (twsk ||
676          * skb->sk) holds true, but we program defensively.
677          */
678         if (!twsk && skb->sk) {
679                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
680         } else if (twsk && twsk->tw_md5_keylen) {
681                 tw_key.key = twsk->tw_md5_key;
682                 tw_key.keylen = twsk->tw_md5_keylen;
683                 key = &tw_key;
684         } else
685                 key = NULL;
686
687         if (key) {
688                 int offset = (ts) ? 3 : 0;
689
690                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
691                                           (TCPOPT_NOP << 16) |
692                                           (TCPOPT_MD5SIG << 8) |
693                                           TCPOLEN_MD5SIG);
694                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
695                 rep.th.doff = arg.iov[0].iov_len/4;
696
697                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
698                                         key,
699                                         ip_hdr(skb)->daddr,
700                                         ip_hdr(skb)->saddr,
701                                         &rep.th, IPPROTO_TCP,
702                                         arg.iov[0].iov_len);
703         }
704 #endif
705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706                                       ip_hdr(skb)->saddr, /* XXX */
707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709         if (twsk)
710                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
711
712         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
713
714         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
715 }
716
717 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
718 {
719         struct inet_timewait_sock *tw = inet_twsk(sk);
720         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
721
722         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
723                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
724                         tcptw->tw_ts_recent);
725
726         inet_twsk_put(tw);
727 }
728
729 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
730                                   struct request_sock *req)
731 {
732         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734                         req->ts_recent);
735 }
736
737 /*
738  *      Send a SYN-ACK after having received an ACK.
739  *      This still operates on a request_sock only, not on a big
740  *      socket.
741  */
742 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
743                               struct dst_entry *dst)
744 {
745         const struct inet_request_sock *ireq = inet_rsk(req);
746         int err = -1;
747         struct sk_buff * skb;
748
749         /* First, grab a route. */
750         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
751                 goto out;
752
753         skb = tcp_make_synack(sk, dst, req);
754
755         if (skb) {
756                 struct tcphdr *th = tcp_hdr(skb);
757
758                 th->check = tcp_v4_check(skb->len,
759                                          ireq->loc_addr,
760                                          ireq->rmt_addr,
761                                          csum_partial((char *)th, skb->len,
762                                                       skb->csum));
763
764                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
765                                             ireq->rmt_addr,
766                                             ireq->opt);
767                 err = net_xmit_eval(err);
768         }
769
770 out:
771         dst_release(dst);
772         return err;
773 }
774
775 /*
776  *      IPv4 request_sock destructor.
777  */
778 static void tcp_v4_reqsk_destructor(struct request_sock *req)
779 {
780         kfree(inet_rsk(req)->opt);
781 }
782
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a "possible SYN flooding" notice naming the destination port of
 * @skb.  A function-static timestamp limits the message to one per
 * 60 seconds (HZ * 60 jiffies).
 */
static void syn_flood_warning(struct sk_buff *skb)
{
        static unsigned long last_warned;

        if (!time_after(jiffies, (last_warned + HZ * 60)))
                return;

        last_warned = jiffies;
        printk(KERN_INFO
               "possible SYN flooding on port %d. Sending cookies.\n",
               ntohs(tcp_hdr(skb)->dest));
}
#endif
796
797 /*
798  * Save and compile IPv4 options into the request_sock if needed.
799  */
800 static struct ip_options *tcp_v4_save_options(struct sock *sk,
801                                               struct sk_buff *skb)
802 {
803         struct ip_options *opt = &(IPCB(skb)->opt);
804         struct ip_options *dopt = NULL;
805
806         if (opt && opt->optlen) {
807                 int opt_size = optlength(opt);
808                 dopt = kmalloc(opt_size, GFP_ATOMIC);
809                 if (dopt) {
810                         if (ip_options_echo(dopt, skb)) {
811                                 kfree(dopt);
812                                 dopt = NULL;
813                         }
814                 }
815         }
816         return dopt;
817 }
818
819 #ifdef CONFIG_TCP_MD5SIG
820 /*
821  * RFC2385 MD5 checksumming requires a mapping of
822  * IP address->MD5 Key.
823  * We need to maintain these in the sk structure.
824  */
825
826 /* Find the Key structure for an address.  */
827 static struct tcp_md5sig_key *
828                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
829 {
830         struct tcp_sock *tp = tcp_sk(sk);
831         int i;
832
833         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
834                 return NULL;
835         for (i = 0; i < tp->md5sig_info->entries4; i++) {
836                 if (tp->md5sig_info->keys4[i].addr == addr)
837                         return &tp->md5sig_info->keys4[i].base;
838         }
839         return NULL;
840 }
841
842 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
843                                          struct sock *addr_sk)
844 {
845         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
846 }
847
848 EXPORT_SYMBOL(tcp_v4_md5_lookup);
849
850 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
851                                                       struct request_sock *req)
852 {
853         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
854 }
855
856 /* This can be called on a newly created socket, from other files */
857 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
858                       u8 *newkey, u8 newkeylen)
859 {
860         /* Add Key to the list */
861         struct tcp_md5sig_key *key;
862         struct tcp_sock *tp = tcp_sk(sk);
863         struct tcp4_md5sig_key *keys;
864
865         key = tcp_v4_md5_do_lookup(sk, addr);
866         if (key) {
867                 /* Pre-existing entry - just update that one. */
868                 kfree(key->key);
869                 key->key = newkey;
870                 key->keylen = newkeylen;
871         } else {
872                 struct tcp_md5sig_info *md5sig;
873
874                 if (!tp->md5sig_info) {
875                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
876                                                   GFP_ATOMIC);
877                         if (!tp->md5sig_info) {
878                                 kfree(newkey);
879                                 return -ENOMEM;
880                         }
881                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
882                 }
883                 if (tcp_alloc_md5sig_pool() == NULL) {
884                         kfree(newkey);
885                         return -ENOMEM;
886                 }
887                 md5sig = tp->md5sig_info;
888
889                 if (md5sig->alloced4 == md5sig->entries4) {
890                         keys = kmalloc((sizeof(*keys) *
891                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
892                         if (!keys) {
893                                 kfree(newkey);
894                                 tcp_free_md5sig_pool();
895                                 return -ENOMEM;
896                         }
897
898                         if (md5sig->entries4)
899                                 memcpy(keys, md5sig->keys4,
900                                        sizeof(*keys) * md5sig->entries4);
901
902                         /* Free old key list, and reference new one */
903                         kfree(md5sig->keys4);
904                         md5sig->keys4 = keys;
905                         md5sig->alloced4++;
906                 }
907                 md5sig->entries4++;
908                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
909                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
910                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
911         }
912         return 0;
913 }
914
915 EXPORT_SYMBOL(tcp_v4_md5_do_add);
916
917 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
918                                u8 *newkey, u8 newkeylen)
919 {
920         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
921                                  newkey, newkeylen);
922 }
923
924 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
925 {
926         struct tcp_sock *tp = tcp_sk(sk);
927         int i;
928
929         for (i = 0; i < tp->md5sig_info->entries4; i++) {
930                 if (tp->md5sig_info->keys4[i].addr == addr) {
931                         /* Free the key */
932                         kfree(tp->md5sig_info->keys4[i].base.key);
933                         tp->md5sig_info->entries4--;
934
935                         if (tp->md5sig_info->entries4 == 0) {
936                                 kfree(tp->md5sig_info->keys4);
937                                 tp->md5sig_info->keys4 = NULL;
938                                 tp->md5sig_info->alloced4 = 0;
939                         } else if (tp->md5sig_info->entries4 != i) {
940                                 /* Need to do some manipulation */
941                                 memmove(&tp->md5sig_info->keys4[i],
942                                         &tp->md5sig_info->keys4[i+1],
943                                         (tp->md5sig_info->entries4 - i) *
944                                          sizeof(struct tcp4_md5sig_key));
945                         }
946                         tcp_free_md5sig_pool();
947                         return 0;
948                 }
949         }
950         return -ENOENT;
951 }
952
953 EXPORT_SYMBOL(tcp_v4_md5_do_del);
954
955 static void tcp_v4_clear_md5_list(struct sock *sk)
956 {
957         struct tcp_sock *tp = tcp_sk(sk);
958
959         /* Free each key, then the set of key keys,
960          * the crypto element, and then decrement our
961          * hold on the last resort crypto.
962          */
963         if (tp->md5sig_info->entries4) {
964                 int i;
965                 for (i = 0; i < tp->md5sig_info->entries4; i++)
966                         kfree(tp->md5sig_info->keys4[i].base.key);
967                 tp->md5sig_info->entries4 = 0;
968                 tcp_free_md5sig_pool();
969         }
970         if (tp->md5sig_info->keys4) {
971                 kfree(tp->md5sig_info->keys4);
972                 tp->md5sig_info->keys4 = NULL;
973                 tp->md5sig_info->alloced4  = 0;
974         }
975 }
976
977 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
978                                  int optlen)
979 {
980         struct tcp_md5sig cmd;
981         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
982         u8 *newkey;
983
984         if (optlen < sizeof(cmd))
985                 return -EINVAL;
986
987         if (copy_from_user(&cmd, optval, sizeof(cmd)))
988                 return -EFAULT;
989
990         if (sin->sin_family != AF_INET)
991                 return -EINVAL;
992
993         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
994                 if (!tcp_sk(sk)->md5sig_info)
995                         return -ENOENT;
996                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
997         }
998
999         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1000                 return -EINVAL;
1001
1002         if (!tcp_sk(sk)->md5sig_info) {
1003                 struct tcp_sock *tp = tcp_sk(sk);
1004                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1005
1006                 if (!p)
1007                         return -EINVAL;
1008
1009                 tp->md5sig_info = p;
1010                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1011         }
1012
1013         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1014         if (!newkey)
1015                 return -ENOMEM;
1016         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1017                                  newkey, cmd.tcpm_keylen);
1018 }
1019
1020 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1021                                    __be32 saddr, __be32 daddr,
1022                                    struct tcphdr *th, int protocol,
1023                                    int tcplen)
1024 {
1025         struct scatterlist sg[4];
1026         __u16 data_len;
1027         int block = 0;
1028         __sum16 old_checksum;
1029         struct tcp_md5sig_pool *hp;
1030         struct tcp4_pseudohdr *bp;
1031         struct hash_desc *desc;
1032         int err;
1033         unsigned int nbytes = 0;
1034
1035         /*
1036          * Okay, so RFC2385 is turned on for this connection,
1037          * so we need to generate the MD5 hash for the packet now.
1038          */
1039
1040         hp = tcp_get_md5sig_pool();
1041         if (!hp)
1042                 goto clear_hash_noput;
1043
1044         bp = &hp->md5_blk.ip4;
1045         desc = &hp->md5_desc;
1046
1047         /*
1048          * 1. the TCP pseudo-header (in the order: source IP address,
1049          * destination IP address, zero-padded protocol number, and
1050          * segment length)
1051          */
1052         bp->saddr = saddr;
1053         bp->daddr = daddr;
1054         bp->pad = 0;
1055         bp->protocol = protocol;
1056         bp->len = htons(tcplen);
1057
1058         sg_init_table(sg, 4);
1059
1060         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1061         nbytes += sizeof(*bp);
1062
1063         /* 2. the TCP header, excluding options, and assuming a
1064          * checksum of zero/
1065          */
1066         old_checksum = th->check;
1067         th->check = 0;
1068         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1069         nbytes += sizeof(struct tcphdr);
1070
1071         /* 3. the TCP segment data (if any) */
1072         data_len = tcplen - (th->doff << 2);
1073         if (data_len > 0) {
1074                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1075                 sg_set_buf(&sg[block++], data, data_len);
1076                 nbytes += data_len;
1077         }
1078
1079         /* 4. an independently-specified key or password, known to both
1080          * TCPs and presumably connection-specific
1081          */
1082         sg_set_buf(&sg[block++], key->key, key->keylen);
1083         nbytes += key->keylen;
1084
1085         sg_mark_end(&sg[block - 1]);
1086
1087         /* Now store the Hash into the packet */
1088         err = crypto_hash_init(desc);
1089         if (err)
1090                 goto clear_hash;
1091         err = crypto_hash_update(desc, sg, nbytes);
1092         if (err)
1093                 goto clear_hash;
1094         err = crypto_hash_final(desc, md5_hash);
1095         if (err)
1096                 goto clear_hash;
1097
1098         /* Reset header, and free up the crypto */
1099         tcp_put_md5sig_pool();
1100         th->check = old_checksum;
1101
1102 out:
1103         return 0;
1104 clear_hash:
1105         tcp_put_md5sig_pool();
1106 clear_hash_noput:
1107         memset(md5_hash, 0, 16);
1108         goto out;
1109 }
1110
1111 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1112                          struct sock *sk,
1113                          struct dst_entry *dst,
1114                          struct request_sock *req,
1115                          struct tcphdr *th, int protocol,
1116                          int tcplen)
1117 {
1118         __be32 saddr, daddr;
1119
1120         if (sk) {
1121                 saddr = inet_sk(sk)->saddr;
1122                 daddr = inet_sk(sk)->daddr;
1123         } else {
1124                 struct rtable *rt = (struct rtable *)dst;
1125                 BUG_ON(!rt);
1126                 saddr = rt->rt_src;
1127                 daddr = rt->rt_dst;
1128         }
1129         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1130                                        saddr, daddr,
1131                                        th, protocol, tcplen);
1132 }
1133
1134 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1135
1136 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1137 {
1138         /*
1139          * This gets called for each TCP segment that arrives
1140          * so we want to be efficient.
1141          * We have 3 drop cases:
1142          * o No MD5 hash and one expected.
1143          * o MD5 hash and we're not expecting one.
1144          * o MD5 hash and its wrong.
1145          */
1146         __u8 *hash_location = NULL;
1147         struct tcp_md5sig_key *hash_expected;
1148         const struct iphdr *iph = ip_hdr(skb);
1149         struct tcphdr *th = tcp_hdr(skb);
1150         int length = (th->doff << 2) - sizeof(struct tcphdr);
1151         int genhash;
1152         unsigned char *ptr;
1153         unsigned char newhash[16];
1154
1155         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1156
1157         /*
1158          * If the TCP option length is less than the TCP_MD5SIG
1159          * option length, then we can shortcut
1160          */
1161         if (length < TCPOLEN_MD5SIG) {
1162                 if (hash_expected)
1163                         return 1;
1164                 else
1165                         return 0;
1166         }
1167
1168         /* Okay, we can't shortcut - we have to grub through the options */
1169         ptr = (unsigned char *)(th + 1);
1170         while (length > 0) {
1171                 int opcode = *ptr++;
1172                 int opsize;
1173
1174                 switch (opcode) {
1175                 case TCPOPT_EOL:
1176                         goto done_opts;
1177                 case TCPOPT_NOP:
1178                         length--;
1179                         continue;
1180                 default:
1181                         opsize = *ptr++;
1182                         if (opsize < 2)
1183                                 goto done_opts;
1184                         if (opsize > length)
1185                                 goto done_opts;
1186
1187                         if (opcode == TCPOPT_MD5SIG) {
1188                                 hash_location = ptr;
1189                                 goto done_opts;
1190                         }
1191                 }
1192                 ptr += opsize-2;
1193                 length -= opsize;
1194         }
1195 done_opts:
1196         /* We've parsed the options - do we have a hash? */
1197         if (!hash_expected && !hash_location)
1198                 return 0;
1199
1200         if (hash_expected && !hash_location) {
1201                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1202                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1203                                NIPQUAD(iph->saddr), ntohs(th->source),
1204                                NIPQUAD(iph->daddr), ntohs(th->dest));
1205                 return 1;
1206         }
1207
1208         if (!hash_expected && hash_location) {
1209                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1210                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1211                                NIPQUAD(iph->saddr), ntohs(th->source),
1212                                NIPQUAD(iph->daddr), ntohs(th->dest));
1213                 return 1;
1214         }
1215
1216         /* Okay, so this is hash_expected and hash_location -
1217          * so we need to calculate the checksum.
1218          */
1219         genhash = tcp_v4_do_calc_md5_hash(newhash,
1220                                           hash_expected,
1221                                           iph->saddr, iph->daddr,
1222                                           th, sk->sk_protocol,
1223                                           skb->len);
1224
1225         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1226                 if (net_ratelimit()) {
1227                         printk(KERN_INFO "MD5 Hash failed for "
1228                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1229                                NIPQUAD(iph->saddr), ntohs(th->source),
1230                                NIPQUAD(iph->daddr), ntohs(th->dest),
1231                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1232                 }
1233                 return 1;
1234         }
1235         return 0;
1236 }
1237
1238 #endif
1239
1240 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1241         .family         =       PF_INET,
1242         .obj_size       =       sizeof(struct tcp_request_sock),
1243         .rtx_syn_ack    =       tcp_v4_send_synack,
1244         .send_ack       =       tcp_v4_reqsk_send_ack,
1245         .destructor     =       tcp_v4_reqsk_destructor,
1246         .send_reset     =       tcp_v4_send_reset,
1247 };
1248
1249 #ifdef CONFIG_TCP_MD5SIG
1250 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1251         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1252 };
1253 #endif
1254
1255 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1256         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1257         .twsk_unique    = tcp_twsk_unique,
1258         .twsk_destructor= tcp_twsk_destructor,
1259 };
1260
1261 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1262 {
1263         struct inet_request_sock *ireq;
1264         struct tcp_options_received tmp_opt;
1265         struct request_sock *req;
1266         __be32 saddr = ip_hdr(skb)->saddr;
1267         __be32 daddr = ip_hdr(skb)->daddr;
1268         __u32 isn = TCP_SKB_CB(skb)->when;
1269         struct dst_entry *dst = NULL;
1270 #ifdef CONFIG_SYN_COOKIES
1271         int want_cookie = 0;
1272 #else
1273 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1274 #endif
1275
1276         /* Never answer to SYNs send to broadcast or multicast */
1277         if (((struct rtable *)skb->dst)->rt_flags &
1278             (RTCF_BROADCAST | RTCF_MULTICAST))
1279                 goto drop;
1280
1281         /* TW buckets are converted to open requests without
1282          * limitations, they conserve resources and peer is
1283          * evidently real one.
1284          */
1285         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1286 #ifdef CONFIG_SYN_COOKIES
1287                 if (sysctl_tcp_syncookies) {
1288                         want_cookie = 1;
1289                 } else
1290 #endif
1291                 goto drop;
1292         }
1293
1294         /* Accept backlog is full. If we have already queued enough
1295          * of warm entries in syn queue, drop request. It is better than
1296          * clogging syn queue with openreqs with exponentially increasing
1297          * timeout.
1298          */
1299         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1300                 goto drop;
1301
1302         req = reqsk_alloc(&tcp_request_sock_ops);
1303         if (!req)
1304                 goto drop;
1305
1306 #ifdef CONFIG_TCP_MD5SIG
1307         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1308 #endif
1309
1310         tcp_clear_options(&tmp_opt);
1311         tmp_opt.mss_clamp = 536;
1312         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1313
1314         tcp_parse_options(skb, &tmp_opt, 0);
1315
1316         if (want_cookie) {
1317                 tcp_clear_options(&tmp_opt);
1318                 tmp_opt.saw_tstamp = 0;
1319         }
1320
1321         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1322                 /* Some OSes (unknown ones, but I see them on web server, which
1323                  * contains information interesting only for windows'
1324                  * users) do not send their stamp in SYN. It is easy case.
1325                  * We simply do not advertise TS support.
1326                  */
1327                 tmp_opt.saw_tstamp = 0;
1328                 tmp_opt.tstamp_ok  = 0;
1329         }
1330         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1331
1332         tcp_openreq_init(req, &tmp_opt, skb);
1333
1334         if (security_inet_conn_request(sk, skb, req))
1335                 goto drop_and_free;
1336
1337         ireq = inet_rsk(req);
1338         ireq->loc_addr = daddr;
1339         ireq->rmt_addr = saddr;
1340         ireq->opt = tcp_v4_save_options(sk, skb);
1341         if (!want_cookie)
1342                 TCP_ECN_create_request(req, tcp_hdr(skb));
1343
1344         if (want_cookie) {
1345 #ifdef CONFIG_SYN_COOKIES
1346                 syn_flood_warning(skb);
1347 #endif
1348                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1349         } else if (!isn) {
1350                 struct inet_peer *peer = NULL;
1351
1352                 /* VJ's idea. We save last timestamp seen
1353                  * from the destination in peer table, when entering
1354                  * state TIME-WAIT, and check against it before
1355                  * accepting new connection request.
1356                  *
1357                  * If "isn" is not zero, this request hit alive
1358                  * timewait bucket, so that all the necessary checks
1359                  * are made in the function processing timewait state.
1360                  */
1361                 if (tmp_opt.saw_tstamp &&
1362                     tcp_death_row.sysctl_tw_recycle &&
1363                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1364                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1365                     peer->v4daddr == saddr) {
1366                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1367                             (s32)(peer->tcp_ts - req->ts_recent) >
1368                                                         TCP_PAWS_WINDOW) {
1369                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1370                                 dst_release(dst);
1371                                 goto drop_and_free;
1372                         }
1373                 }
1374                 /* Kill the following clause, if you dislike this way. */
1375                 else if (!sysctl_tcp_syncookies &&
1376                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1377                           (sysctl_max_syn_backlog >> 2)) &&
1378                          (!peer || !peer->tcp_ts_stamp) &&
1379                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1380                         /* Without syncookies last quarter of
1381                          * backlog is filled with destinations,
1382                          * proven to be alive.
1383                          * It means that we continue to communicate
1384                          * to destinations, already remembered
1385                          * to the moment of synflood.
1386                          */
1387                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1388                                        "request from %u.%u.%u.%u/%u\n",
1389                                        NIPQUAD(saddr),
1390                                        ntohs(tcp_hdr(skb)->source));
1391                         dst_release(dst);
1392                         goto drop_and_free;
1393                 }
1394
1395                 isn = tcp_v4_init_sequence(skb);
1396         }
1397         tcp_rsk(req)->snt_isn = isn;
1398
1399         if (tcp_v4_send_synack(sk, req, dst))
1400                 goto drop_and_free;
1401
1402         if (want_cookie) {
1403                 reqsk_free(req);
1404         } else {
1405                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1406         }
1407         return 0;
1408
1409 drop_and_free:
1410         reqsk_free(req);
1411 drop:
1412         return 0;
1413 }
1414
1415
1416 /*
1417  * The three way handshake has completed - we got a valid synack -
1418  * now create the new socket.
1419  */
1420 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1421                                   struct request_sock *req,
1422                                   struct dst_entry *dst)
1423 {
1424         struct inet_request_sock *ireq;
1425         struct inet_sock *newinet;
1426         struct tcp_sock *newtp;
1427         struct sock *newsk;
1428 #ifdef CONFIG_TCP_MD5SIG
1429         struct tcp_md5sig_key *key;
1430 #endif
1431
1432         if (sk_acceptq_is_full(sk))
1433                 goto exit_overflow;
1434
1435         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1436                 goto exit;
1437
1438         newsk = tcp_create_openreq_child(sk, req, skb);
1439         if (!newsk)
1440                 goto exit;
1441
1442         newsk->sk_gso_type = SKB_GSO_TCPV4;
1443         sk_setup_caps(newsk, dst);
1444
1445         newtp                 = tcp_sk(newsk);
1446         newinet               = inet_sk(newsk);
1447         ireq                  = inet_rsk(req);
1448         newinet->daddr        = ireq->rmt_addr;
1449         newinet->rcv_saddr    = ireq->loc_addr;
1450         newinet->saddr        = ireq->loc_addr;
1451         newinet->opt          = ireq->opt;
1452         ireq->opt             = NULL;
1453         newinet->mc_index     = inet_iif(skb);
1454         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1455         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1456         if (newinet->opt)
1457                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1458         newinet->id = newtp->write_seq ^ jiffies;
1459
1460         tcp_mtup_init(newsk);
1461         tcp_sync_mss(newsk, dst_mtu(dst));
1462         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1463         tcp_initialize_rcv_mss(newsk);
1464
1465 #ifdef CONFIG_TCP_MD5SIG
1466         /* Copy over the MD5 key from the original socket */
1467         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1468                 /*
1469                  * We're using one, so create a matching key
1470                  * on the newsk structure. If we fail to get
1471                  * memory, then we end up not copying the key
1472                  * across. Shucks.
1473                  */
1474                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1475                 if (newkey != NULL)
1476                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1477                                           newkey, key->keylen);
1478         }
1479 #endif
1480
1481         __inet_hash(&tcp_hashinfo, newsk, 0);
1482         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1483
1484         return newsk;
1485
1486 exit_overflow:
1487         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1488 exit:
1489         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1490         dst_release(dst);
1491         return NULL;
1492 }
1493
1494 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1495 {
1496         struct tcphdr *th = tcp_hdr(skb);
1497         const struct iphdr *iph = ip_hdr(skb);
1498         struct sock *nsk;
1499         struct request_sock **prev;
1500         /* Find possible connection requests. */
1501         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1502                                                        iph->saddr, iph->daddr);
1503         if (req)
1504                 return tcp_check_req(sk, skb, req, prev);
1505
1506         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1507                                       iph->daddr, th->dest, inet_iif(skb));
1508
1509         if (nsk) {
1510                 if (nsk->sk_state != TCP_TIME_WAIT) {
1511                         bh_lock_sock(nsk);
1512                         return nsk;
1513                 }
1514                 inet_twsk_put(inet_twsk(nsk));
1515                 return NULL;
1516         }
1517
1518 #ifdef CONFIG_SYN_COOKIES
1519         if (!th->rst && !th->syn && th->ack)
1520                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1521 #endif
1522         return sk;
1523 }
1524
1525 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1526 {
1527         const struct iphdr *iph = ip_hdr(skb);
1528
1529         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1530                 if (!tcp_v4_check(skb->len, iph->saddr,
1531                                   iph->daddr, skb->csum)) {
1532                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1533                         return 0;
1534                 }
1535         }
1536
1537         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1538                                        skb->len, IPPROTO_TCP, 0);
1539
1540         if (skb->len <= 76) {
1541                 return __skb_checksum_complete(skb);
1542         }
1543         return 0;
1544 }
1545
1546
1547 /* The socket must have it's spinlock held when we get
1548  * here.
1549  *
1550  * We have a potential double-lock case here, so even when
1551  * doing backlog processing we use the BH locking scheme.
1552  * This is because we cannot sleep with the original spinlock
1553  * held.
1554  */
1555 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1556 {
1557         struct sock *rsk;
1558 #ifdef CONFIG_TCP_MD5SIG
1559         /*
1560          * We really want to reject the packet as early as possible
1561          * if:
1562          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1563          *  o There is an MD5 option and we're not expecting one
1564          */
1565         if (tcp_v4_inbound_md5_hash(sk, skb))
1566                 goto discard;
1567 #endif
1568
1569         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1570                 TCP_CHECK_TIMER(sk);
1571                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1572                         rsk = sk;
1573                         goto reset;
1574                 }
1575                 TCP_CHECK_TIMER(sk);
1576                 return 0;
1577         }
1578
1579         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1580                 goto csum_err;
1581
1582         if (sk->sk_state == TCP_LISTEN) {
1583                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1584                 if (!nsk)
1585                         goto discard;
1586
1587                 if (nsk != sk) {
1588                         if (tcp_child_process(sk, nsk, skb)) {
1589                                 rsk = nsk;
1590                                 goto reset;
1591                         }
1592                         return 0;
1593                 }
1594         }
1595
1596         TCP_CHECK_TIMER(sk);
1597         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1598                 rsk = sk;
1599                 goto reset;
1600         }
1601         TCP_CHECK_TIMER(sk);
1602         return 0;
1603
1604 reset:
1605         tcp_v4_send_reset(rsk, skb);
1606 discard:
1607         kfree_skb(skb);
1608         /* Be careful here. If this function gets more complicated and
1609          * gcc suffers from register pressure on the x86, sk (in %ebx)
1610          * might be destroyed here. This current version compiles correctly,
1611          * but you have been warned.
1612          */
1613         return 0;
1614
1615 csum_err:
1616         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1617         goto discard;
1618 }
1619
1620 /*
1621  *      From tcp_input.c
      *
      *      tcp_v4_rcv() takes one incoming skb, validates its TCP header,
      *      fills in the per-skb TCP control block, looks up the owning
      *      socket and hands the segment over to it.
      *
      *      Return value: 0 on every path that discards the skb; otherwise
      *      whatever tcp_v4_do_rcv() returned, or 0 when the skb was
      *      prequeued or put on the socket backlog.
      *
      *      Every discard path frees the skb with kfree_skb().  The socket
      *      reference obtained from the lookup is dropped with sock_put()
      *      (or inet_twsk_put() for a TIME_WAIT socket) before returning.
1622  */
1623
1624 int tcp_v4_rcv(struct sk_buff *skb)
1625 {
1626         const struct iphdr *iph;
1627         struct tcphdr *th;
1628         struct sock *sk;
1629         int ret;
1630
             /* Anything not marked PACKET_HOST is dropped before it is counted. */
1631         if (skb->pkt_type != PACKET_HOST)
1632                 goto discard_it;
1633
1634         /* Count it even if it's bad */
1635         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1636
             /* Make sure the fixed-size TCP header can be read. */
1637         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1638                 goto discard_it;
1639
1640         th = tcp_hdr(skb);
1641
             /* doff counts 32-bit words: reject a data offset smaller than the
              * fixed header, then make sure the full header (with options) can
              * be read.
              */
1642         if (th->doff < sizeof(struct tcphdr) / 4)
1643                 goto bad_packet;
1644         if (!pskb_may_pull(skb, th->doff * 4))
1645                 goto discard_it;
1646
1647         /* An explanation is required here, I think.
1648          * Packet length and doff are validated by header prediction,
1649          * provided case of th->doff==0 is eliminated.
1650          * So, we defer the checks. */
1651         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1652                 goto bad_packet;
1653
             /* Header pointers are fetched again here; presumably the pulls
              * above may have relocated the data - TODO confirm.
              */
1654         th = tcp_hdr(skb);
1655         iph = ip_hdr(skb);
             /* Fill the TCP control block.  end_seq advances by the payload
              * length plus one for each of SYN and FIN.
              */
1656         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1657         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1658                                     skb->len - th->doff * 4);
1659         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1660         TCP_SKB_CB(skb)->when    = 0;
             /* The flags field of the control block carries the IP TOS here. */
1661         TCP_SKB_CB(skb)->flags   = iph->tos;
1662         TCP_SKB_CB(skb)->sacked  = 0;
1663
1664         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1665                            iph->daddr, th->dest, inet_iif(skb));
1666         if (!sk)
1667                 goto no_tcp_socket;
1668
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1669 process:
1670         if (sk->sk_state == TCP_TIME_WAIT)
1671                 goto do_time_wait;
1672
1673         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1674                 goto discard_and_relse;
1675         nf_reset(skb);
1676
1677         if (sk_filter(sk, skb))
1678                 goto discard_and_relse;
1679
1680         skb->dev = NULL;
1681
             /* With the socket locked: if no user context owns it, process the
              * segment now (or prequeue it); otherwise append it to the backlog.
              */
1682         bh_lock_sock_nested(sk);
1683         ret = 0;
1684         if (!sock_owned_by_user(sk)) {
1685 #ifdef CONFIG_NET_DMA
1686                 struct tcp_sock *tp = tcp_sk(sk);
1687                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1688                         tp->ucopy.dma_chan = get_softnet_dma();
1689                 if (tp->ucopy.dma_chan)
1690                         ret = tcp_v4_do_rcv(sk, skb);
1691                 else
1692 #endif
1693                 {
1694                         if (!tcp_prequeue(sk, skb))
                             /* NOTE: the next statement is the body of the if
                              * above, despite being indented at the same level.
                              */
1695                         ret = tcp_v4_do_rcv(sk, skb);
1696                 }
1697         } else
1698                 sk_add_backlog(sk, skb);
1699         bh_unlock_sock(sk);
1700
1701         sock_put(sk);
1702
1703         return ret;
1704
             /* No socket matched (also reached for TCP_TW_RST below). */
1705 no_tcp_socket:
1706         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1707                 goto discard_it;
1708
             /* The bad_packet label sits inside the if-body so that the header
              * checks above can jump straight to the error counter; a segment
              * that passes the length and checksum tests gets a reset instead.
              */
1709         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1710 bad_packet:
1711                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1712         } else {
1713                 tcp_v4_send_reset(NULL, skb);
1714         }
1715
1716 discard_it:
1717         /* Discard frame. */
1718         kfree_skb(skb);
1719         return 0;
1720
1721 discard_and_relse:
1722         sock_put(sk);
1723         goto discard_it;
1724
             /* sk is a TIME_WAIT socket here; its reference is dropped with
              * inet_twsk_put() on the early-exit paths below.
              */
1725 do_time_wait:
1726         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1727                 inet_twsk_put(inet_twsk(sk));
1728                 goto discard_it;
1729         }
1730
1731         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1732                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1733                 inet_twsk_put(inet_twsk(sk));
1734                 goto discard_it;
1735         }
1736         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1737         case TCP_TW_SYN: {
                     /* If a listener exists for the destination, retire the
                      * TIME_WAIT socket and restart processing on the listener;
                      * otherwise fall through to the ACK case.
                      */
1738                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1739                                                         iph->daddr, th->dest,
1740                                                         inet_iif(skb));
1741                 if (sk2) {
1742                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743                         inet_twsk_put(inet_twsk(sk));
1744                         sk = sk2;
1745                         goto process;
1746                 }
1747                 /* Fall through to ACK */
1748         }
1749         case TCP_TW_ACK:
1750                 tcp_v4_timewait_ack(sk, skb);
1751                 break;
1752         case TCP_TW_RST:
1753                 goto no_tcp_socket;
             /* Nothing further to do; the skb is freed at discard_it. */
1754         case TCP_TW_SUCCESS:;
1755         }
1756         goto discard_it;
1757 }
1758
1759 /* VJ's idea. Save last timestamp seen from this destination
1760  * and hold it at least for normal timewait interval to use for duplicate
1761  * segment detection in subsequent connections, before they enter synchronized
1762  * state.
1763  */
1764
1765 int tcp_v4_remember_stamp(struct sock *sk)
1766 {
1767         struct inet_sock *inet = inet_sk(sk);
1768         struct tcp_sock *tp = tcp_sk(sk);
1769         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1770         struct inet_peer *peer = NULL;
1771         int release_it = 0;
1772
1773         if (!rt || rt->rt_dst != inet->daddr) {
1774                 peer = inet_getpeer(inet->daddr, 1);
1775                 release_it = 1;
1776         } else {
1777                 if (!rt->peer)
1778                         rt_bind_peer(rt, 1);
1779                 peer = rt->peer;
1780         }
1781
1782         if (peer) {
1783                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1784                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1785                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1786                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1787                         peer->tcp_ts = tp->rx_opt.ts_recent;
1788                 }
1789                 if (release_it)
1790                         inet_putpeer(peer);
1791                 return 1;
1792         }
1793
1794         return 0;
1795 }
1796
1797 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1798 {
1799         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1800
1801         if (peer) {
1802                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1803
1804                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1805                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1806                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1807                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1808                         peer->tcp_ts       = tcptw->tw_ts_recent;
1809                 }
1810                 inet_putpeer(peer);
1811                 return 1;
1812         }
1813
1814         return 0;
1815 }
1816
1817 struct inet_connection_sock_af_ops ipv4_specific = {
1818         .queue_xmit        = ip_queue_xmit,
1819         .send_check        = tcp_v4_send_check,
1820         .rebuild_header    = inet_sk_rebuild_header,
1821         .conn_request      = tcp_v4_conn_request,
1822         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1823         .remember_stamp    = tcp_v4_remember_stamp,
1824         .net_header_len    = sizeof(struct iphdr),
1825         .setsockopt        = ip_setsockopt,
1826         .getsockopt        = ip_getsockopt,
1827         .addr2sockaddr     = inet_csk_addr2sockaddr,
1828         .sockaddr_len      = sizeof(struct sockaddr_in),
1829 #ifdef CONFIG_COMPAT
1830         .compat_setsockopt = compat_ip_setsockopt,
1831         .compat_getsockopt = compat_ip_getsockopt,
1832 #endif
1833 };
1834
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 handlers for the TCP MD5 signature hooks. */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        .md5_lookup    = tcp_v4_md5_lookup,
        .md5_add       = tcp_v4_md5_add_func,
        .md5_parse     = tcp_v4_parse_md5_keys,
        .calc_md5_hash = tcp_v4_calc_md5_hash,
};
#endif
1843
1844 /* NOTE: A lot of things set to zero explicitly by call to
1845  *       sk_alloc() so need not be done here.
1846  */
1847 static int tcp_v4_init_sock(struct sock *sk)
1848 {
1849         struct inet_connection_sock *icsk = inet_csk(sk);
1850         struct tcp_sock *tp = tcp_sk(sk);
1851
1852         skb_queue_head_init(&tp->out_of_order_queue);
1853         tcp_init_xmit_timers(sk);
1854         tcp_prequeue_init(tp);
1855
1856         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1857         tp->mdev = TCP_TIMEOUT_INIT;
1858
1859         /* So many TCP implementations out there (incorrectly) count the
1860          * initial SYN frame in their delayed-ACK and congestion control
1861          * algorithms that we must have the following bandaid to talk
1862          * efficiently to them.  -DaveM
1863          */
1864         tp->snd_cwnd = 2;
1865
1866         /* See draft-stevens-tcpca-spec-01 for discussion of the
1867          * initialization of these values.
1868          */
1869         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1870         tp->snd_cwnd_clamp = ~0;
1871         tp->mss_cache = 536;
1872
1873         tp->reordering = sysctl_tcp_reordering;
1874         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1875
1876         sk->sk_state = TCP_CLOSE;
1877
1878         sk->sk_write_space = sk_stream_write_space;
1879         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1880
1881         icsk->icsk_af_ops = &ipv4_specific;
1882         icsk->icsk_sync_mss = tcp_sync_mss;
1883 #ifdef CONFIG_TCP_MD5SIG
1884         tp->af_specific = &tcp_sock_ipv4_specific;
1885 #endif
1886
1887         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1888         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1889
1890         atomic_inc(&tcp_sockets_allocated);
1891
1892         return 0;
1893 }
1894
1895 int tcp_v4_destroy_sock(struct sock *sk)
1896 {
1897         struct tcp_sock *tp = tcp_sk(sk);
1898
1899         tcp_clear_xmit_timers(sk);
1900
1901         tcp_cleanup_congestion_control(sk);
1902
1903         /* Cleanup up the write buffer. */
1904         tcp_write_queue_purge(sk);
1905
1906         /* Cleans up our, hopefully empty, out_of_order_queue. */
1907         __skb_queue_purge(&tp->out_of_order_queue);
1908
1909 #ifdef CONFIG_TCP_MD5SIG
1910         /* Clean up the MD5 key list, if any */
1911         if (tp->md5sig_info) {
1912                 tcp_v4_clear_md5_list(sk);
1913                 kfree(tp->md5sig_info);
1914                 tp->md5sig_info = NULL;
1915         }
1916 #endif
1917
1918 #ifdef CONFIG_NET_DMA
1919         /* Cleans up our sk_async_wait_queue */
1920         __skb_queue_purge(&sk->sk_async_wait_queue);
1921 #endif
1922
1923         /* Clean prequeue, it must be empty really */
1924         __skb_queue_purge(&tp->ucopy.prequeue);
1925
1926         /* Clean up a referenced TCP bind bucket. */
1927         if (inet_csk(sk)->icsk_bind_hash)
1928                 inet_put_port(&tcp_hashinfo, sk);
1929
1930         /*
1931          * If sendmsg cached page exists, toss it.
1932          */
1933         if (sk->sk_sndmsg_page) {
1934                 __free_page(sk->sk_sndmsg_page);
1935                 sk->sk_sndmsg_page = NULL;
1936         }
1937
1938         atomic_dec(&tcp_sockets_allocated);
1939
1940         return 0;
1941 }
1942
1943 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1944
1945 #ifdef CONFIG_PROC_FS
1946 /* Proc filesystem TCP sock list dumping. */
1947
1948 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1949 {
1950         return hlist_empty(head) ? NULL :
1951                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1952 }
1953
1954 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1955 {
1956         return tw->tw_node.next ?
1957                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1958 }
1959
/*
 * Advance the /proc iterator over the listening hash.
 *
 * cur is the element returned by the previous call, or NULL to start at
 * bucket 0.  Depending on st->state it is either a listening socket
 * (TCP_SEQ_STATE_LISTENING) or a pending request_sock of the listener
 * saved in st->syn_wait_sk (TCP_SEQ_STATE_OPENREQ).
 *
 * Returns the next socket or request whose family equals st->family, or
 * NULL once all INET_LHTABLE_SIZE buckets have been walked.  When a
 * request is returned, the syn_wait_lock of st->syn_wait_sk is still
 * read-locked.
 */
1960 static void *listening_get_next(struct seq_file *seq, void *cur)
1961 {
1962         struct inet_connection_sock *icsk;
1963         struct hlist_node *node;
1964         struct sock *sk = cur;
1965         struct tcp_iter_state* st = seq->private;
1966
             /* Fresh start: begin with the head of bucket 0. */
1967         if (!sk) {
1968                 st->bucket = 0;
1969                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1970                 goto get_sk;
1971         }
1972
1973         ++st->num;
1974
1975         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request: continue along its chain, then through
                      * the remaining syn_table buckets of the same listener.
                      */
1976                 struct request_sock *req = cur;
1977
1978                 icsk = inet_csk(st->syn_wait_sk);
1979                 req = req->dl_next;
1980                 while (1) {
1981                         while (req) {
1982                                 if (req->rsk_ops->family == st->family) {
1983                                         cur = req;
1984                                         goto out;
1985                                 }
1986                                 req = req->dl_next;
1987                         }
1988                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1989                                 break;
                     /* Entered from start_req below with syn_wait_lock held and
                      * st->sbucket reset to 0.
                      */
1990 get_req:
1991                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1992                 }
                     /* Requests exhausted: move on to the next listener and drop
                      * the syn_wait_lock of the one just finished.
                      */
1993                 sk        = sk_next(st->syn_wait_sk);
1994                 st->state = TCP_SEQ_STATE_LISTENING;
1995                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1996         } else {
                     /* cur is a listener that was just reported: visit its
                      * pending requests first, if it has any.
                      */
1997                 icsk = inet_csk(sk);
1998                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1999                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2000                         goto start_req;
2001                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2002                 sk = sk_next(sk);
2003         }
2004 get_sk:
2005         sk_for_each_from(sk, node) {
2006                 if (sk->sk_family == st->family) {
2007                         cur = sk;
2008                         goto out;
2009                 }
                     /* A socket of another family is not reported itself, but
                      * its request queue is still scanned; requests are filtered
                      * by req->rsk_ops->family above.
                      */
2010                 icsk = inet_csk(sk);
2011                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2013 start_req:
2014                         st->uid         = sock_i_uid(sk);
2015                         st->syn_wait_sk = sk;
2016                         st->state       = TCP_SEQ_STATE_OPENREQ;
2017                         st->sbucket     = 0;
2018                         goto get_req;
2019                 }
2020                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2021         }
             /* Chain finished: continue with the next listening bucket. */
2022         if (++st->bucket < INET_LHTABLE_SIZE) {
2023                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2024                 goto get_sk;
2025         }
2026         cur = NULL;
2027 out:
2028         return cur;
2029 }
2030
2031 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2032 {
2033         void *rc = listening_get_next(seq, NULL);
2034
2035         while (rc && *pos) {
2036                 rc = listening_get_next(seq, rc);
2037                 --*pos;
2038         }
2039         return rc;
2040 }
2041
2042 static void *established_get_first(struct seq_file *seq)
2043 {
2044         struct tcp_iter_state* st = seq->private;
2045         void *rc = NULL;
2046
2047         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2048                 struct sock *sk;
2049                 struct hlist_node *node;
2050                 struct inet_timewait_sock *tw;
2051                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2052
2053                 read_lock_bh(lock);
2054                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2055                         if (sk->sk_family != st->family) {
2056                                 continue;
2057                         }
2058                         rc = sk;
2059                         goto out;
2060                 }
2061                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2062                 inet_twsk_for_each(tw, node,
2063                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2064                         if (tw->tw_family != st->family) {
2065                                 continue;
2066                         }
2067                         rc = tw;
2068                         goto out;
2069                 }
2070                 read_unlock_bh(lock);
2071                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2072         }
2073 out:
2074         return rc;
2075 }
2076
/*
 * Advance the /proc iterator over the established hash.
 *
 * cur is the previously returned entry: a struct sock when st->state is
 * TCP_SEQ_STATE_ESTABLISHED, an inet_timewait_sock when it is
 * TCP_SEQ_STATE_TIME_WAIT.  The lock of bucket st->bucket is expected to
 * be read-locked on entry, since it is released here when the bucket is
 * finished.
 *
 * Returns the next entry of st->family with its bucket lock held, or
 * NULL with no lock held after the last bucket.
 */
2077 static void *established_get_next(struct seq_file *seq, void *cur)
2078 {
2079         struct sock *sk = cur;
2080         struct inet_timewait_sock *tw;
2081         struct hlist_node *node;
2082         struct tcp_iter_state* st = seq->private;
2083
2084         ++st->num;
2085
2086         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2087                 tw = cur;
2088                 tw = tw_next(tw);
                     /* Also entered from below, with tw set to the head of the
                      * timewait chain, once the socket chain is exhausted.
                      */
2089 get_tw:
2090                 while (tw && tw->tw_family != st->family) {
2091                         tw = tw_next(tw);
2092                 }
2093                 if (tw) {
2094                         cur = tw;
2095                         goto out;
2096                 }
                     /* Bucket finished: release its lock before moving on. */
2097                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2099
2100                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2101                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2102                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2103                 } else {
2104                         cur = NULL;
2105                         goto out;
2106                 }
2107         } else
2108                 sk = sk_next(sk);
2109
2110         sk_for_each_from(sk, node) {
2111                 if (sk->sk_family == st->family)
2112                         goto found;
2113         }
2114
             /* Socket chain exhausted: switch to this bucket's timewait chain. */
2115         st->state = TCP_SEQ_STATE_TIME_WAIT;
2116         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2117         goto get_tw;
2118 found:
2119         cur = sk;
2120 out:
2121         return cur;
2122 }
2123
2124 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2125 {
2126         void *rc = established_get_first(seq);
2127
2128         while (rc && pos) {
2129                 rc = established_get_next(seq, rc);
2130                 --pos;
2131         }
2132         return rc;
2133 }
2134
2135 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2136 {
2137         void *rc;
2138         struct tcp_iter_state* st = seq->private;
2139
2140         inet_listen_lock(&tcp_hashinfo);
2141         st->state = TCP_SEQ_STATE_LISTENING;
2142         rc        = listening_get_idx(seq, &pos);
2143
2144         if (!rc) {
2145                 inet_listen_unlock(&tcp_hashinfo);
2146                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2147                 rc        = established_get_idx(seq, pos);
2148         }
2149
2150         return rc;
2151 }
2152
2153 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2154 {
2155         struct tcp_iter_state* st = seq->private;
2156         st->state = TCP_SEQ_STATE_LISTENING;
2157         st->num = 0;
2158         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2159 }
2160
2161 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2162 {
2163         void *rc = NULL;
2164         struct tcp_iter_state* st;
2165
2166         if (v == SEQ_START_TOKEN) {
2167                 rc = tcp_get_idx(seq, 0);
2168                 goto out;
2169         }
2170         st = seq->private;
2171
2172         switch (st->state) {
2173         case TCP_SEQ_STATE_OPENREQ:
2174         case TCP_SEQ_STATE_LISTENING:
2175                 rc = listening_get_next(seq, v);
2176                 if (!rc) {
2177                         inet_listen_unlock(&tcp_hashinfo);
2178                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2179                         rc        = established_get_first(seq);
2180                 }
2181                 break;
2182         case TCP_SEQ_STATE_ESTABLISHED:
2183         case TCP_SEQ_STATE_TIME_WAIT:
2184                 rc = established_get_next(seq, v);
2185                 break;
2186         }
2187 out:
2188         ++*pos;
2189         return rc;
2190 }
2191
2192 static void tcp_seq_stop(struct seq_file *seq, void *v)
2193 {
2194         struct tcp_iter_state* st = seq->private;
2195
2196         switch (st->state) {
2197         case TCP_SEQ_STATE_OPENREQ:
2198                 if (v) {
2199                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2200                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2201                 }
2202         case TCP_SEQ_STATE_LISTENING:
2203                 if (v != SEQ_START_TOKEN)
2204                         inet_listen_unlock(&tcp_hashinfo);
2205                 break;
2206         case TCP_SEQ_STATE_TIME_WAIT:
2207         case TCP_SEQ_STATE_ESTABLISHED:
2208                 if (v)
2209                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2210                 break;
2211         }
2212 }
2213
2214 static int tcp_seq_open(struct inode *inode, struct file *file)
2215 {
2216         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2217         struct seq_file *seq;
2218         struct tcp_iter_state *s;
2219         int rc;
2220
2221         if (unlikely(afinfo == NULL))
2222                 return -EINVAL;
2223
2224         s = kzalloc(sizeof(*s), GFP_KERNEL);
2225         if (!s)
2226                 return -ENOMEM;
2227         s->family               = afinfo->family;
2228         s->seq_ops.start        = tcp_seq_start;
2229         s->seq_ops.next         = tcp_seq_next;
2230         s->seq_ops.show         = afinfo->seq_show;
2231         s->seq_ops.stop         = tcp_seq_stop;
2232
2233         rc = seq_open(file, &s->seq_ops);
2234         if (rc)
2235                 goto out_kfree;
2236         seq          = file->private_data;
2237         seq->private = s;
2238 out:
2239         return rc;
2240 out_kfree:
2241         kfree(s);
2242         goto out;
2243 }
2244
2245 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2246 {
2247         int rc = 0;
2248         struct proc_dir_entry *p;
2249
2250         if (!afinfo)
2251                 return -EINVAL;
2252         afinfo->seq_fops->owner         = afinfo->owner;
2253         afinfo->seq_fops->open          = tcp_seq_open;
2254         afinfo->seq_fops->read          = seq_read;
2255         afinfo->seq_fops->llseek        = seq_lseek;
2256         afinfo->seq_fops->release       = seq_release_private;
2257
2258         p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2259         if (p)
2260                 p->data = afinfo;
2261         else
2262                 rc = -ENOMEM;
2263         return rc;
2264 }
2265
2266 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2267 {
2268         if (!afinfo)
2269                 return;
2270         proc_net_remove(&init_net, afinfo->name);
2271         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2272 }
2273
2274 static void get_openreq4(struct sock *sk, struct request_sock *req,
2275                          char *tmpbuf, int i, int uid)
2276 {
2277         const struct inet_request_sock *ireq = inet_rsk(req);
2278         int ttd = req->expires - jiffies;
2279
2280         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2281                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2282                 i,
2283                 ireq->loc_addr,
2284                 ntohs(inet_sk(sk)->sport),
2285                 ireq->rmt_addr,
2286                 ntohs(ireq->rmt_port),
2287                 TCP_SYN_RECV,
2288                 0, 0, /* could print option size, but that is af dependent. */
2289                 1,    /* timers active (only the expire timer) */
2290                 jiffies_to_clock_t(ttd),
2291                 req->retrans,
2292                 uid,
2293                 0,  /* non standard timer */
2294                 0, /* open_requests have no inode */
2295                 atomic_read(&sk->sk_refcnt),
2296                 req);
2297 }
2298
2299 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2300 {
2301         int timer_active;
2302         unsigned long timer_expires;
2303         struct tcp_sock *tp = tcp_sk(sk);
2304         const struct inet_connection_sock *icsk = inet_csk(sk);
2305         struct inet_sock *inet = inet_sk(sk);
2306         __be32 dest = inet->daddr;
2307         __be32 src = inet->rcv_saddr;
2308         __u16 destp = ntohs(inet->dport);
2309         __u16 srcp = ntohs(inet->sport);
2310
2311         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2312                 timer_active    = 1;
2313                 timer_expires   = icsk->icsk_timeout;
2314         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2315                 timer_active    = 4;
2316                 timer_expires   = icsk->icsk_timeout;
2317         } else if (timer_pending(&sk->sk_timer)) {
2318                 timer_active    = 2;
2319                 timer_expires   = sk->sk_timer.expires;
2320         } else {
2321                 timer_active    = 0;
2322                 timer_expires = jiffies;
2323         }
2324
2325         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2326                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2327                 i, src, srcp, dest, destp, sk->sk_state,
2328                 tp->write_seq - tp->snd_una,
2329                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2330                                              (tp->rcv_nxt - tp->copied_seq),
2331                 timer_active,
2332                 jiffies_to_clock_t(timer_expires - jiffies),
2333                 icsk->icsk_retransmits,
2334                 sock_i_uid(sk),
2335                 icsk->icsk_probes_out,
2336                 sock_i_ino(sk),
2337                 atomic_read(&sk->sk_refcnt), sk,
2338                 icsk->icsk_rto,
2339                 icsk->icsk_ack.ato,
2340                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2341                 tp->snd_cwnd,
2342                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2343 }
2344
2345 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2346                                char *tmpbuf, int i)
2347 {
2348         __be32 dest, src;
2349         __u16 destp, srcp;
2350         int ttd = tw->tw_ttd - jiffies;
2351
2352         if (ttd < 0)
2353                 ttd = 0;
2354
2355         dest  = tw->tw_daddr;
2356         src   = tw->tw_rcv_saddr;
2357         destp = ntohs(tw->tw_dport);
2358         srcp  = ntohs(tw->tw_sport);
2359
2360         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2361                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2362                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2363                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2364                 atomic_read(&tw->tw_refcnt), tw);
2365 }
2366
2367 #define TMPSZ 150
2368
2369 static int tcp4_seq_show(struct seq_file *seq, void *v)
2370 {
2371         struct tcp_iter_state* st;
2372         char tmpbuf[TMPSZ + 1];
2373
2374         if (v == SEQ_START_TOKEN) {
2375                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2376                            "  sl  local_address rem_address   st tx_queue "
2377                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2378                            "inode");
2379                 goto out;
2380         }
2381         st = seq->private;
2382
2383         switch (st->state) {
2384         case TCP_SEQ_STATE_LISTENING:
2385         case TCP_SEQ_STATE_ESTABLISHED:
2386                 get_tcp4_sock(v, tmpbuf, st->num);
2387                 break;
2388         case TCP_SEQ_STATE_OPENREQ:
2389                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2390                 break;
2391         case TCP_SEQ_STATE_TIME_WAIT:
2392                 get_timewait4_sock(v, tmpbuf, st->num);
2393                 break;
2394         }
2395         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2396 out:
2397         return 0;
2398 }
2399
2400 static struct file_operations tcp4_seq_fops;
2401 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2402         .owner          = THIS_MODULE,
2403         .name           = "tcp",
2404         .family         = AF_INET,
2405         .seq_show       = tcp4_seq_show,
2406         .seq_fops       = &tcp4_seq_fops,
2407 };
2408
2409 int __init tcp4_proc_init(void)
2410 {
2411         return tcp_proc_register(&tcp4_seq_afinfo);
2412 }
2413
2414 void tcp4_proc_exit(void)
2415 {
2416         tcp_proc_unregister(&tcp4_seq_afinfo);
2417 }
2418 #endif /* CONFIG_PROC_FS */
2419
2420 DEFINE_PROTO_INUSE(tcp)
2421
     /*
      * Protocol descriptor for IPv4 TCP sockets: binds the generic socket
      * operations to their TCP implementations and points at the TCP-wide
      * accounting variables and sysctl limits.
      */
2422 struct proto tcp_prot = {
2423         .name                   = "TCP",
2424         .owner                  = THIS_MODULE,
             /* Socket operations. */
2425         .close                  = tcp_close,
2426         .connect                = tcp_v4_connect,
2427         .disconnect             = tcp_disconnect,
2428         .accept                 = inet_csk_accept,
2429         .ioctl                  = tcp_ioctl,
2430         .init                   = tcp_v4_init_sock,
2431         .destroy                = tcp_v4_destroy_sock,
2432         .shutdown               = tcp_shutdown,
2433         .setsockopt             = tcp_setsockopt,
2434         .getsockopt             = tcp_getsockopt,
2435         .recvmsg                = tcp_recvmsg,
             /* Same handler that tcp_v4_rcv() calls directly when the socket
              * is not owned by a user context.
              */
2436         .backlog_rcv            = tcp_v4_do_rcv,
2437         .hash                   = tcp_v4_hash,
2438         .unhash                 = tcp_unhash,
2439         .get_port               = tcp_v4_get_port,
             /* Memory accounting and limits. */
2440         .enter_memory_pressure  = tcp_enter_memory_pressure,
             /* Counter maintained by tcp_v4_init_sock()/tcp_v4_destroy_sock(). */
2441         .sockets_allocated      = &tcp_sockets_allocated,
2442         .orphan_count           = &tcp_orphan_count,
2443         .memory_allocated       = &tcp_memory_allocated,
2444         .memory_pressure        = &tcp_memory_pressure,
2445         .sysctl_mem             = sysctl_tcp_mem,
2446         .sysctl_wmem            = sysctl_tcp_wmem,
2447         .sysctl_rmem            = sysctl_tcp_rmem,
2448         .max_header             = MAX_TCP_HEADER,
2449         .obj_size               = sizeof(struct tcp_sock),
             /* Descriptors for the timewait and request mini-sockets. */
2450         .twsk_prot              = &tcp_timewait_sock_ops,
2451         .rsk_prot               = &tcp_request_sock_ops,
2452 #ifdef CONFIG_COMPAT
2453         .compat_setsockopt      = compat_tcp_setsockopt,
2454         .compat_getsockopt      = compat_tcp_getsockopt,
2455 #endif
             /* Presumably pairs with DEFINE_PROTO_INUSE(tcp) above - TODO confirm. */
2456         REF_PROTO_INUSE(tcp)
2457 };
2458
2459 void __init tcp_v4_init(struct net_proto_family *ops)
2460 {
2461         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2462                                      IPPROTO_TCP) < 0)
2463                 panic("Failed to create the TCP control socket.\n");
2464 }
2465
/* Symbols from this file exported for use by other kernel code. */
2466 EXPORT_SYMBOL(ipv4_specific);
2467 EXPORT_SYMBOL(tcp_hashinfo);
2468 EXPORT_SYMBOL(tcp_prot);
2469 EXPORT_SYMBOL(tcp_unhash);
2470 EXPORT_SYMBOL(tcp_v4_conn_request);
2471 EXPORT_SYMBOL(tcp_v4_connect);
2472 EXPORT_SYMBOL(tcp_v4_do_rcv);
2473 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2474 EXPORT_SYMBOL(tcp_v4_send_check);
2475 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2476
/* The proc registration helpers are only built with CONFIG_PROC_FS. */
2477 #ifdef CONFIG_PROC_FS
2478 EXPORT_SYMBOL(tcp_proc_register);
2479 EXPORT_SYMBOL(tcp_proc_unregister);
2480 #endif
2481 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2482