net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87 /* Check TCP sequence numbers in ICMP packets. */
88 #define ICMP_MIN_LENGTH 8
89
90 /* Socket used for sending RSTs */
91 static struct socket *tcp_socket __read_mostly;
92
93 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97                                                    __be32 addr);
98 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99                                    __be32 saddr, __be32 daddr,
100                                    struct tcphdr *th, int protocol,
101                                    int tcplen);
102 #endif
103
104 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106         .lhash_users = ATOMIC_INIT(0),
107         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108 };
109
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129                                           ip_hdr(skb)->saddr,
130                                           tcp_hdr(skb)->dest,
131                                           tcp_hdr(skb)->source);
132 }
133
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided the sequence
141            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
142
143            Actually, the idea is close to VJ's; only the timestamp cache is
144            held not per host but per port pair, and the TW bucket is used as
145            the state holder.
146
147            If the TW bucket has already been destroyed, we fall back to VJ's
148            scheme and use the initial timestamp retrieved from the peer table.
149          */
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
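                /* Start the new connection's sequence space just past anything
                 * the old incarnation could have sent: snd_nxt plus the largest
                 * unscaled window (65535) plus 2, so stray segments from the old
                 * connection cannot be mistaken for new data.
                 */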
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167 /* This will initiate an outgoing connection. */
168 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169 {
170         struct inet_sock *inet = inet_sk(sk);
171         struct tcp_sock *tp = tcp_sk(sk);
172         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173         struct rtable *rt;
174         __be32 daddr, nexthop;
175         int tmp;
176         int err;
177
178         if (addr_len < sizeof(struct sockaddr_in))
179                 return -EINVAL;
180
181         if (usin->sin_family != AF_INET)
182                 return -EAFNOSUPPORT;
183
184         nexthop = daddr = usin->sin_addr.s_addr;
185         if (inet->opt && inet->opt->srr) {
186                 if (!daddr)
187                         return -EINVAL;
188                 nexthop = inet->opt->faddr;
189         }
190
191         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193                                IPPROTO_TCP,
194                                inet->sport, usin->sin_port, sk, 1);
195         if (tmp < 0) {
196                 if (tmp == -ENETUNREACH)
197                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
198                 return tmp;
199         }
200
201         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202                 ip_rt_put(rt);
203                 return -ENETUNREACH;
204         }
205
206         if (!inet->opt || !inet->opt->srr)
207                 daddr = rt->rt_dst;
208
209         if (!inet->saddr)
210                 inet->saddr = rt->rt_src;
211         inet->rcv_saddr = inet->saddr;
212
213         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
214                 /* Reset inherited state */
215                 tp->rx_opt.ts_recent       = 0;
216                 tp->rx_opt.ts_recent_stamp = 0;
217                 tp->write_seq              = 0;
218         }
219
220         if (tcp_death_row.sysctl_tw_recycle &&
221             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
222                 struct inet_peer *peer = rt_get_peer(rt);
223                 /*
224                  * VJ's idea. We save the last timestamp seen from
225                  * the destination in the peer table when entering state
226                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
227                  * when trying a new connection.
228                  */
229                 if (peer != NULL &&
230                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
231                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
232                         tp->rx_opt.ts_recent = peer->tcp_ts;
233                 }
234         }
235
236         inet->dport = usin->sin_port;
237         inet->daddr = daddr;
238
239         inet_csk(sk)->icsk_ext_hdr_len = 0;
240         if (inet->opt)
241                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
242
243         tp->rx_opt.mss_clamp = 536;
244
245         /* Socket identity is still unknown (sport may be zero).
246          * However we set the state to SYN-SENT and, without releasing the
247          * socket lock, select a source port, enter ourselves into the hash
248          * tables and complete initialization after this.
249          */
250         tcp_set_state(sk, TCP_SYN_SENT);
251         err = inet_hash_connect(&tcp_death_row, sk);
252         if (err)
253                 goto failure;
254
255         err = ip_route_newports(&rt, IPPROTO_TCP,
256                                 inet->sport, inet->dport, sk);
257         if (err)
258                 goto failure;
259
260         /* OK, now commit destination to socket.  */
261         sk->sk_gso_type = SKB_GSO_TCPV4;
262         sk_setup_caps(sk, &rt->u.dst);
263
264         if (!tp->write_seq)
265                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
266                                                            inet->daddr,
267                                                            inet->sport,
268                                                            usin->sin_port);
269
270         inet->id = tp->write_seq ^ jiffies;
271
272         err = tcp_connect(sk);
273         rt = NULL;
274         if (err)
275                 goto failure;
276
277         return 0;
278
279 failure:
280         /*
281          * This unhashes the socket and releases the local port,
282          * if necessary.
283          */
284         tcp_set_state(sk, TCP_CLOSE);
285         ip_rt_put(rt);
286         sk->sk_route_caps = 0;
287         inet->dport = 0;
288         return err;
289 }
290
291 /*
292  * This routine does path mtu discovery as defined in RFC1191.
293  */
294 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
295 {
296         struct dst_entry *dst;
297         struct inet_sock *inet = inet_sk(sk);
298
299         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300          * sent out by Linux are always < 576 bytes, so they should go through
301          * unfragmented).
302          */
303         if (sk->sk_state == TCP_LISTEN)
304                 return;
305
306         /* We don't check in the dst entry if pmtu discovery is forbidden
307          * on this route. We just assume that no packet-too-big packets
308          * are sent back when pmtu discovery is not active.
309          * There is a small race when the user changes this flag in the
310          * route, but I think that's acceptable.
311          */
312         if ((dst = __sk_dst_check(sk, 0)) == NULL)
313                 return;
314
315         dst->ops->update_pmtu(dst, mtu);
316
317         /* Something is about to go wrong... Remember the soft error
318          * in case this connection will not be able to recover.
319          */
320         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321                 sk->sk_err_soft = EMSGSIZE;
322
323         mtu = dst_mtu(dst);
324
325         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
326             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
327                 tcp_sync_mss(sk, mtu);
328
329                 /* Resend the TCP packet because it's
330                  * clear that the old packet has been
331                  * dropped. This is the new "fast" path mtu
332                  * discovery.
333                  */
334                 tcp_simple_retransmit(sk);
335         } /* else let the usual retransmit timer handle it */
336 }
337
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353
354 void tcp_v4_err(struct sk_buff *skb, u32 info)
355 {
356         struct iphdr *iph = (struct iphdr *)skb->data;
357         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
358         struct tcp_sock *tp;
359         struct inet_sock *inet;
360         const int type = icmp_hdr(skb)->type;
361         const int code = icmp_hdr(skb)->code;
362         struct sock *sk;
363         __u32 seq;
364         int err;
365
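        /* We need at least the IP header plus 8 bytes of the embedded TCP
         * header (ports and sequence number) to identify the connection.
         */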
366         if (skb->len < (iph->ihl << 2) + 8) {
367                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
368                 return;
369         }
370
371         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
372                          th->source, inet_iif(skb));
373         if (!sk) {
374                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
375                 return;
376         }
377         if (sk->sk_state == TCP_TIME_WAIT) {
378                 inet_twsk_put(inet_twsk(sk));
379                 return;
380         }
381
382         bh_lock_sock(sk);
383         /* If too many ICMPs get dropped on busy
384          * servers this needs to be solved differently.
385          */
386         if (sock_owned_by_user(sk))
387                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
388
389         if (sk->sk_state == TCP_CLOSE)
390                 goto out;
391
392         tp = tcp_sk(sk);
393         seq = ntohl(th->seq);
394         if (sk->sk_state != TCP_LISTEN &&
395             !between(seq, tp->snd_una, tp->snd_nxt)) {
396                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
397                 goto out;
398         }
399
400         switch (type) {
401         case ICMP_SOURCE_QUENCH:
402                 /* Just silently ignore these. */
403                 goto out;
404         case ICMP_PARAMETERPROB:
405                 err = EPROTO;
406                 break;
407         case ICMP_DEST_UNREACH:
408                 if (code > NR_ICMP_UNREACH)
409                         goto out;
410
411                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
412                         if (!sock_owned_by_user(sk))
413                                 do_pmtu_discovery(sk, iph, info);
414                         goto out;
415                 }
416
417                 err = icmp_err_convert[code].errno;
418                 break;
419         case ICMP_TIME_EXCEEDED:
420                 err = EHOSTUNREACH;
421                 break;
422         default:
423                 goto out;
424         }
425
426         switch (sk->sk_state) {
427                 struct request_sock *req, **prev;
428         case TCP_LISTEN:
429                 if (sock_owned_by_user(sk))
430                         goto out;
431
432                 req = inet_csk_search_req(sk, &prev, th->dest,
433                                           iph->daddr, iph->saddr);
434                 if (!req)
435                         goto out;
436
437                 /* ICMPs are not backlogged, hence we cannot get
438                    an established socket here.
439                  */
440                 BUG_TRAP(!req->sk);
441
442                 if (seq != tcp_rsk(req)->snt_isn) {
443                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
444                         goto out;
445                 }
446
447                 /*
448                  * Still in SYN_RECV, just remove it silently.
449                  * There is no good way to pass the error to the newly
450                  * created socket, and POSIX does not want network
451                  * errors returned from accept().
452                  */
453                 inet_csk_reqsk_queue_drop(sk, req, prev);
454                 goto out;
455
456         case TCP_SYN_SENT:
457         case TCP_SYN_RECV:  /* Cannot happen normally.
458                                It can, for example, if SYNs crossed.
459                              */
460                 if (!sock_owned_by_user(sk)) {
461                         sk->sk_err = err;
462
463                         sk->sk_error_report(sk);
464
465                         tcp_done(sk);
466                 } else {
467                         sk->sk_err_soft = err;
468                 }
469                 goto out;
470         }
471
472         /* If we've already connected we will keep trying
473          * until we time out, or the user gives up.
474          *
475          * RFC 1122 4.2.3.9 allows us to consider as hard errors
476          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
477          * but it is obsoleted by pmtu discovery).
478          *
479          * Note that in the modern internet, where routing is unreliable
480          * and broken firewalls sit in every dark corner sending random
481          * errors as ordered by their masters, even these two messages have
482          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
483          *
484          * Now we are in compliance with RFCs.
485          *                                                      --ANK (980905)
486          */
487
488         inet = inet_sk(sk);
489         if (!sock_owned_by_user(sk) && inet->recverr) {
490                 sk->sk_err = err;
491                 sk->sk_error_report(sk);
492         } else  { /* Only an error on timeout */
493                 sk->sk_err_soft = err;
494         }
495
496 out:
497         bh_unlock_sock(sk);
498         sock_put(sk);
499 }
500
501 /* This routine computes an IPv4 TCP checksum. */
502 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
503 {
504         struct inet_sock *inet = inet_sk(sk);
505         struct tcphdr *th = tcp_hdr(skb);
506
507         if (skb->ip_summed == CHECKSUM_PARTIAL) {
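                /* Checksum offload: store only the pseudo-header sum in
                 * th->check and tell the device where to fold in the rest
                 * via csum_start/csum_offset.
                 */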
508                 th->check = ~tcp_v4_check(len, inet->saddr,
509                                           inet->daddr, 0);
510                 skb->csum_start = skb_transport_header(skb) - skb->head;
511                 skb->csum_offset = offsetof(struct tcphdr, check);
512         } else {
513                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
514                                          csum_partial((char *)th,
515                                                       th->doff << 2,
516                                                       skb->csum));
517         }
518 }
519
520 int tcp_v4_gso_send_check(struct sk_buff *skb)
521 {
522         const struct iphdr *iph;
523         struct tcphdr *th;
524
525         if (!pskb_may_pull(skb, sizeof(*th)))
526                 return -EINVAL;
527
528         iph = ip_hdr(skb);
529         th = tcp_hdr(skb);
530
531         th->check = 0;
532         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
533         skb->csum_start = skb_transport_header(skb) - skb->head;
534         skb->csum_offset = offsetof(struct tcphdr, check);
535         skb->ip_summed = CHECKSUM_PARTIAL;
536         return 0;
537 }
538
539 /*
540  *      This routine will send an RST to the other tcp.
541  *
542  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
543  *                    for the reset?
544  *      Answer: if a packet caused an RST, it is not for a socket
545  *              existing in our system; if it did match a socket,
546  *              it is just a duplicate segment or a bug in the other side's TCP.
547  *              So we build the reply based only on the parameters that
548  *              arrived with the segment.
549  *      Exception: precedence violation. We do not implement it in any case.
550  */
551
552 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
553 {
554         struct tcphdr *th = tcp_hdr(skb);
555         struct {
556                 struct tcphdr th;
557 #ifdef CONFIG_TCP_MD5SIG
558                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
559 #endif
560         } rep;
561         struct ip_reply_arg arg;
562 #ifdef CONFIG_TCP_MD5SIG
563         struct tcp_md5sig_key *key;
564 #endif
565
566         /* Never send a reset in response to a reset. */
567         if (th->rst)
568                 return;
569
570         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
571                 return;
572
573         /* Swap the send and the receive. */
574         memset(&rep, 0, sizeof(rep));
575         rep.th.dest   = th->source;
576         rep.th.source = th->dest;
577         rep.th.doff   = sizeof(struct tcphdr) / 4;
578         rep.th.rst    = 1;
579
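        /* Per RFC 793: if the offending segment carried an ACK, the RST takes
         * its sequence number from that ACK; otherwise the RST carries an ACK
         * covering everything the segment occupied (its data plus one for each
         * of the SYN and FIN flags).
         */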
580         if (th->ack) {
581                 rep.th.seq = th->ack_seq;
582         } else {
583                 rep.th.ack = 1;
584                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
585                                        skb->len - (th->doff << 2));
586         }
587
588         memset(&arg, 0, sizeof(arg));
589         arg.iov[0].iov_base = (unsigned char *)&rep;
590         arg.iov[0].iov_len  = sizeof(rep.th);
591
592 #ifdef CONFIG_TCP_MD5SIG
593         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
594         if (key) {
595                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
596                                    (TCPOPT_NOP << 16) |
597                                    (TCPOPT_MD5SIG << 8) |
598                                    TCPOLEN_MD5SIG);
599                 /* Update length and the length the header thinks exists */
600                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
601                 rep.th.doff = arg.iov[0].iov_len / 4;
602
603                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
604                                         key,
605                                         ip_hdr(skb)->daddr,
606                                         ip_hdr(skb)->saddr,
607                                         &rep.th, IPPROTO_TCP,
608                                         arg.iov[0].iov_len);
609         }
610 #endif
611         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
612                                       ip_hdr(skb)->saddr, /* XXX */
613                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
614         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
615
616         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
617
618         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
619         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
620 }
621
622 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
623    outside socket context, is certainly ugly. What can I do?
624  */
625
626 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
627                             struct sk_buff *skb, u32 seq, u32 ack,
628                             u32 win, u32 ts)
629 {
630         struct tcphdr *th = tcp_hdr(skb);
631         struct {
632                 struct tcphdr th;
633                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
634 #ifdef CONFIG_TCP_MD5SIG
635                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
636 #endif
637                         ];
638         } rep;
639         struct ip_reply_arg arg;
640 #ifdef CONFIG_TCP_MD5SIG
641         struct tcp_md5sig_key *key;
642         struct tcp_md5sig_key tw_key;
643 #endif
644
645         memset(&rep.th, 0, sizeof(struct tcphdr));
646         memset(&arg, 0, sizeof(arg));
647
648         arg.iov[0].iov_base = (unsigned char *)&rep;
649         arg.iov[0].iov_len  = sizeof(rep.th);
650         if (ts) {
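                /* Emit the timestamp option padded to 32 bits with two NOPs:
                 * NOP, NOP, kind, length, then TSval (now) and TSecr (ts).
                 */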
651                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
652                                    (TCPOPT_TIMESTAMP << 8) |
653                                    TCPOLEN_TIMESTAMP);
654                 rep.opt[1] = htonl(tcp_time_stamp);
655                 rep.opt[2] = htonl(ts);
656                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
657         }
658
659         /* Swap the send and the receive. */
660         rep.th.dest    = th->source;
661         rep.th.source  = th->dest;
662         rep.th.doff    = arg.iov[0].iov_len / 4;
663         rep.th.seq     = htonl(seq);
664         rep.th.ack_seq = htonl(ack);
665         rep.th.ack     = 1;
666         rep.th.window  = htons(win);
667
668 #ifdef CONFIG_TCP_MD5SIG
669         /*
670          * The SKB holds an incoming packet, but may not have a valid ->sk
671          * pointer. This is especially the case when we're dealing with a
672          * TIME_WAIT ack, because the sk structure is long gone, and only
673          * the tcp_timewait_sock remains. So the md5 key is stashed in that
674          * structure, and we use it in preference.  I believe that (twsk ||
675          * skb->sk) holds true, but we program defensively.
676          */
677         if (!twsk && skb->sk) {
678                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
679         } else if (twsk && twsk->tw_md5_keylen) {
680                 tw_key.key = twsk->tw_md5_key;
681                 tw_key.keylen = twsk->tw_md5_keylen;
682                 key = &tw_key;
683         } else
684                 key = NULL;
685
686         if (key) {
687                 int offset = (ts) ? 3 : 0;
688
689                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
690                                           (TCPOPT_NOP << 16) |
691                                           (TCPOPT_MD5SIG << 8) |
692                                           TCPOLEN_MD5SIG);
693                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
694                 rep.th.doff = arg.iov[0].iov_len/4;
695
696                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
697                                         key,
698                                         ip_hdr(skb)->daddr,
699                                         ip_hdr(skb)->saddr,
700                                         &rep.th, IPPROTO_TCP,
701                                         arg.iov[0].iov_len);
702         }
703 #endif
704         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
705                                       ip_hdr(skb)->saddr, /* XXX */
706                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
707         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
708         if (twsk)
709                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
710
711         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
712
713         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
714 }
715
716 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
717 {
718         struct inet_timewait_sock *tw = inet_twsk(sk);
719         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
720
721         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
722                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
723                         tcptw->tw_ts_recent);
724
725         inet_twsk_put(tw);
726 }
727
728 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
729                                   struct request_sock *req)
730 {
731         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
732                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
733                         req->ts_recent);
734 }
735
736 /*
737  *      Send a SYN-ACK after having received an ACK.
738  *      This still operates on a request_sock only, not on a big
739  *      socket.
740  */
741 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
742                               struct dst_entry *dst)
743 {
744         const struct inet_request_sock *ireq = inet_rsk(req);
745         int err = -1;
746         struct sk_buff * skb;
747
748         /* First, grab a route. */
749         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
750                 goto out;
751
752         skb = tcp_make_synack(sk, dst, req);
753
754         if (skb) {
755                 struct tcphdr *th = tcp_hdr(skb);
756
757                 th->check = tcp_v4_check(skb->len,
758                                          ireq->loc_addr,
759                                          ireq->rmt_addr,
760                                          csum_partial((char *)th, skb->len,
761                                                       skb->csum));
762
763                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
764                                             ireq->rmt_addr,
765                                             ireq->opt);
766                 err = net_xmit_eval(err);
767         }
768
769 out:
770         dst_release(dst);
771         return err;
772 }
773
774 /*
775  *      IPv4 request_sock destructor.
776  */
777 static void tcp_v4_reqsk_destructor(struct request_sock *req)
778 {
779         kfree(inet_rsk(req)->opt);
780 }
781
782 #ifdef CONFIG_SYN_COOKIES
783 static void syn_flood_warning(struct sk_buff *skb)
784 {
785         static unsigned long warntime;
786
787         if (time_after(jiffies, (warntime + HZ * 60))) {
788                 warntime = jiffies;
789                 printk(KERN_INFO
790                        "possible SYN flooding on port %d. Sending cookies.\n",
791                        ntohs(tcp_hdr(skb)->dest));
792         }
793 }
794 #endif
795
796 /*
797  * Save and compile IPv4 options into the request_sock if needed.
798  */
799 static struct ip_options *tcp_v4_save_options(struct sock *sk,
800                                               struct sk_buff *skb)
801 {
802         struct ip_options *opt = &(IPCB(skb)->opt);
803         struct ip_options *dopt = NULL;
804
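        /* ip_options_echo() rebuilds the received options for use in the
         * reverse direction (e.g. reversing a recorded source route); if it
         * fails we simply drop the copy and keep no options.
         */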
805         if (opt && opt->optlen) {
806                 int opt_size = optlength(opt);
807                 dopt = kmalloc(opt_size, GFP_ATOMIC);
808                 if (dopt) {
809                         if (ip_options_echo(dopt, skb)) {
810                                 kfree(dopt);
811                                 dopt = NULL;
812                         }
813                 }
814         }
815         return dopt;
816 }
817
818 #ifdef CONFIG_TCP_MD5SIG
819 /*
820  * RFC2385 MD5 checksumming requires a mapping of
821  * IP address->MD5 Key.
822  * We need to maintain these in the sk structure.
823  */
824
825 /* Find the Key structure for an address.  */
826 static struct tcp_md5sig_key *
827                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
828 {
829         struct tcp_sock *tp = tcp_sk(sk);
830         int i;
831
832         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
833                 return NULL;
834         for (i = 0; i < tp->md5sig_info->entries4; i++) {
835                 if (tp->md5sig_info->keys4[i].addr == addr)
836                         return (struct tcp_md5sig_key *)
837                                                 &tp->md5sig_info->keys4[i];
838         }
839         return NULL;
840 }
841
842 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
843                                          struct sock *addr_sk)
844 {
845         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
846 }
847
848 EXPORT_SYMBOL(tcp_v4_md5_lookup);
849
850 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
851                                                       struct request_sock *req)
852 {
853         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
854 }
855
856 /* This can be called on a newly created socket, from other files */
857 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
858                       u8 *newkey, u8 newkeylen)
859 {
860         /* Add Key to the list */
861         struct tcp4_md5sig_key *key;
862         struct tcp_sock *tp = tcp_sk(sk);
863         struct tcp4_md5sig_key *keys;
864
865         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
866         if (key) {
867                 /* Pre-existing entry - just update that one. */
868                 kfree(key->key);
869                 key->key = newkey;
870                 key->keylen = newkeylen;
871         } else {
872                 struct tcp_md5sig_info *md5sig;
873
874                 if (!tp->md5sig_info) {
875                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
876                                                   GFP_ATOMIC);
877                         if (!tp->md5sig_info) {
878                                 kfree(newkey);
879                                 return -ENOMEM;
880                         }
881                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
882                 }
883                 if (tcp_alloc_md5sig_pool() == NULL) {
884                         kfree(newkey);
885                         return -ENOMEM;
886                 }
887                 md5sig = tp->md5sig_info;
888
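                /* No free slot left: grow the keys4 array by one entry,
                 * copying any existing keys into the new allocation.
                 */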
889                 if (md5sig->alloced4 == md5sig->entries4) {
890                         keys = kmalloc((sizeof(*keys) *
891                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
892                         if (!keys) {
893                                 kfree(newkey);
894                                 tcp_free_md5sig_pool();
895                                 return -ENOMEM;
896                         }
897
898                         if (md5sig->entries4)
899                                 memcpy(keys, md5sig->keys4,
900                                        sizeof(*keys) * md5sig->entries4);
901
902                         /* Free old key list, and reference new one */
903                         if (md5sig->keys4)
904                                 kfree(md5sig->keys4);
905                         md5sig->keys4 = keys;
906                         md5sig->alloced4++;
907                 }
908                 md5sig->entries4++;
909                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
910                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
911                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
912         }
913         return 0;
914 }
915
916 EXPORT_SYMBOL(tcp_v4_md5_do_add);
917
918 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
919                                u8 *newkey, u8 newkeylen)
920 {
921         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
922                                  newkey, newkeylen);
923 }
924
925 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
926 {
927         struct tcp_sock *tp = tcp_sk(sk);
928         int i;
929
930         for (i = 0; i < tp->md5sig_info->entries4; i++) {
931                 if (tp->md5sig_info->keys4[i].addr == addr) {
932                         /* Free the key */
933                         kfree(tp->md5sig_info->keys4[i].key);
934                         tp->md5sig_info->entries4--;
935
936                         if (tp->md5sig_info->entries4 == 0) {
937                                 kfree(tp->md5sig_info->keys4);
938                                 tp->md5sig_info->keys4 = NULL;
939                                 tp->md5sig_info->alloced4 = 0;
940                         } else if (tp->md5sig_info->entries4 != i) {
941                                 /* Shift the remaining entries down to fill the hole */
942                                 memcpy(&tp->md5sig_info->keys4[i],
943                                        &tp->md5sig_info->keys4[i+1],
944                                        (tp->md5sig_info->entries4 - i) *
945                                         sizeof(struct tcp4_md5sig_key));
946                         }
947                         tcp_free_md5sig_pool();
948                         return 0;
949                 }
950         }
951         return -ENOENT;
952 }
953
954 EXPORT_SYMBOL(tcp_v4_md5_do_del);
955
956 static void tcp_v4_clear_md5_list(struct sock *sk)
957 {
958         struct tcp_sock *tp = tcp_sk(sk);
959
960         /* Free each key, then the array of keys,
961          * the crypto element, and then decrement our
962          * hold on the last resort crypto.
963          */
964         if (tp->md5sig_info->entries4) {
965                 int i;
966                 for (i = 0; i < tp->md5sig_info->entries4; i++)
967                         kfree(tp->md5sig_info->keys4[i].key);
968                 tp->md5sig_info->entries4 = 0;
969                 tcp_free_md5sig_pool();
970         }
971         if (tp->md5sig_info->keys4) {
972                 kfree(tp->md5sig_info->keys4);
973                 tp->md5sig_info->keys4 = NULL;
974                 tp->md5sig_info->alloced4  = 0;
975         }
976 }
977
978 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
979                                  int optlen)
980 {
981         struct tcp_md5sig cmd;
982         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
983         u8 *newkey;
984
985         if (optlen < sizeof(cmd))
986                 return -EINVAL;
987
988         if (copy_from_user(&cmd, optval, sizeof(cmd)))
989                 return -EFAULT;
990
991         if (sin->sin_family != AF_INET)
992                 return -EINVAL;
993
994         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
995                 if (!tcp_sk(sk)->md5sig_info)
996                         return -ENOENT;
997                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
998         }
999
1000         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1001                 return -EINVAL;
1002
1003         if (!tcp_sk(sk)->md5sig_info) {
1004                 struct tcp_sock *tp = tcp_sk(sk);
1005                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1006
1007                 if (!p)
1008                         return -EINVAL;
1009
1010                 tp->md5sig_info = p;
1011                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1012         }
1013
1014         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1015         if (!newkey)
1016                 return -ENOMEM;
1017         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1018                                  newkey, cmd.tcpm_keylen);
1019 }
1020
1021 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1022                                    __be32 saddr, __be32 daddr,
1023                                    struct tcphdr *th, int protocol,
1024                                    int tcplen)
1025 {
1026         struct scatterlist sg[4];
1027         __u16 data_len;
1028         int block = 0;
1029         __sum16 old_checksum;
1030         struct tcp_md5sig_pool *hp;
1031         struct tcp4_pseudohdr *bp;
1032         struct hash_desc *desc;
1033         int err;
1034         unsigned int nbytes = 0;
1035
1036         /*
1037          * Okay, so RFC2385 is turned on for this connection,
1038          * so we need to generate the MD5 hash for the packet now.
1039          */
1040
1041         hp = tcp_get_md5sig_pool();
1042         if (!hp)
1043                 goto clear_hash_noput;
1044
1045         bp = &hp->md5_blk.ip4;
1046         desc = &hp->md5_desc;
1047
1048         /*
1049          * 1. the TCP pseudo-header (in the order: source IP address,
1050          * destination IP address, zero-padded protocol number, and
1051          * segment length)
1052          */
1053         bp->saddr = saddr;
1054         bp->daddr = daddr;
1055         bp->pad = 0;
1056         bp->protocol = protocol;
1057         bp->len = htons(tcplen);
1058         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1059         nbytes += sizeof(*bp);
1060
1061         /* 2. the TCP header, excluding options, and assuming a
1062          * checksum of zero.
1063          */
1064         old_checksum = th->check;
1065         th->check = 0;
1066         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1067         nbytes += sizeof(struct tcphdr);
1068
1069         /* 3. the TCP segment data (if any) */
1070         data_len = tcplen - (th->doff << 2);
1071         if (data_len > 0) {
1072                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1073                 sg_set_buf(&sg[block++], data, data_len);
1074                 nbytes += data_len;
1075         }
1076
1077         /* 4. an independently-specified key or password, known to both
1078          * TCPs and presumably connection-specific
1079          */
1080         sg_set_buf(&sg[block++], key->key, key->keylen);
1081         nbytes += key->keylen;
1082
1083         /* Now store the Hash into the packet */
1084         err = crypto_hash_init(desc);
1085         if (err)
1086                 goto clear_hash;
1087         err = crypto_hash_update(desc, sg, nbytes);
1088         if (err)
1089                 goto clear_hash;
1090         err = crypto_hash_final(desc, md5_hash);
1091         if (err)
1092                 goto clear_hash;
1093
1094         /* Reset header, and free up the crypto */
1095         tcp_put_md5sig_pool();
1096         th->check = old_checksum;
1097
1098 out:
1099         return 0;
1100 clear_hash:
1101         tcp_put_md5sig_pool();
1102 clear_hash_noput:
1103         memset(md5_hash, 0, 16);
1104         goto out;
1105 }
1106
1107 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1108                          struct sock *sk,
1109                          struct dst_entry *dst,
1110                          struct request_sock *req,
1111                          struct tcphdr *th, int protocol,
1112                          int tcplen)
1113 {
1114         __be32 saddr, daddr;
1115
1116         if (sk) {
1117                 saddr = inet_sk(sk)->saddr;
1118                 daddr = inet_sk(sk)->daddr;
1119         } else {
1120                 struct rtable *rt = (struct rtable *)dst;
1121                 BUG_ON(!rt);
1122                 saddr = rt->rt_src;
1123                 daddr = rt->rt_dst;
1124         }
1125         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1126                                        saddr, daddr,
1127                                        th, protocol, tcplen);
1128 }
1129
1130 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1131
1132 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1133 {
1134         /*
1135          * This gets called for each TCP segment that arrives
1136          * so we want to be efficient.
1137          * We have 3 drop cases:
1138          * o No MD5 hash and one expected.
1139          * o MD5 hash and we're not expecting one.
1140          * o MD5 hash and it's wrong.
1141          */
1142         __u8 *hash_location = NULL;
1143         struct tcp_md5sig_key *hash_expected;
1144         const struct iphdr *iph = ip_hdr(skb);
1145         struct tcphdr *th = tcp_hdr(skb);
1146         int length = (th->doff << 2) - sizeof(struct tcphdr);
1147         int genhash;
1148         unsigned char *ptr;
1149         unsigned char newhash[16];
1150
1151         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1152
1153         /*
1154          * If the TCP option length is less than the TCP_MD5SIG
1155          * option length, then we can shortcut
1156          */
1157         if (length < TCPOLEN_MD5SIG) {
1158                 if (hash_expected)
1159                         return 1;
1160                 else
1161                         return 0;
1162         }
1163
1164         /* Okay, we can't shortcut - we have to grub through the options */
1165         ptr = (unsigned char *)(th + 1);
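        /* Standard TCP option walk: EOL ends the list, NOP is a single byte,
         * everything else is (kind, length, data); bail out on a malformed
         * length and stop at the first MD5SIG option found.
         */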
1166         while (length > 0) {
1167                 int opcode = *ptr++;
1168                 int opsize;
1169
1170                 switch (opcode) {
1171                 case TCPOPT_EOL:
1172                         goto done_opts;
1173                 case TCPOPT_NOP:
1174                         length--;
1175                         continue;
1176                 default:
1177                         opsize = *ptr++;
1178                         if (opsize < 2)
1179                                 goto done_opts;
1180                         if (opsize > length)
1181                                 goto done_opts;
1182
1183                         if (opcode == TCPOPT_MD5SIG) {
1184                                 hash_location = ptr;
1185                                 goto done_opts;
1186                         }
1187                 }
1188                 ptr += opsize-2;
1189                 length -= opsize;
1190         }
1191 done_opts:
1192         /* We've parsed the options - do we have a hash? */
1193         if (!hash_expected && !hash_location)
1194                 return 0;
1195
1196         if (hash_expected && !hash_location) {
1197                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1198                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1199                                NIPQUAD(iph->saddr), ntohs(th->source),
1200                                NIPQUAD(iph->daddr), ntohs(th->dest));
1201                 return 1;
1202         }
1203
1204         if (!hash_expected && hash_location) {
1205                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1206                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1207                                NIPQUAD(iph->saddr), ntohs(th->source),
1208                                NIPQUAD(iph->daddr), ntohs(th->dest));
1209                 return 1;
1210         }
1211
1212         /* Okay, so we have both hash_expected and hash_location -
1213          * so we need to calculate the hash and compare.
1214          */
1215         genhash = tcp_v4_do_calc_md5_hash(newhash,
1216                                           hash_expected,
1217                                           iph->saddr, iph->daddr,
1218                                           th, sk->sk_protocol,
1219                                           skb->len);
1220
1221         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1222                 if (net_ratelimit()) {
1223                         printk(KERN_INFO "MD5 Hash failed for "
1224                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1225                                NIPQUAD(iph->saddr), ntohs(th->source),
1226                                NIPQUAD(iph->daddr), ntohs(th->dest),
1227                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1228                 }
1229                 return 1;
1230         }
1231         return 0;
1232 }
1233
1234 #endif
1235
1236 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1237         .family         =       PF_INET,
1238         .obj_size       =       sizeof(struct tcp_request_sock),
1239         .rtx_syn_ack    =       tcp_v4_send_synack,
1240         .send_ack       =       tcp_v4_reqsk_send_ack,
1241         .destructor     =       tcp_v4_reqsk_destructor,
1242         .send_reset     =       tcp_v4_send_reset,
1243 };
1244
1245 #ifdef CONFIG_TCP_MD5SIG
1246 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1247         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1248 };
1249 #endif
1250
1251 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1252         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1253         .twsk_unique    = tcp_twsk_unique,
1254         .twsk_destructor= tcp_twsk_destructor,
1255 };
1256
1257 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1258 {
1259         struct inet_request_sock *ireq;
1260         struct tcp_options_received tmp_opt;
1261         struct request_sock *req;
1262         __be32 saddr = ip_hdr(skb)->saddr;
1263         __be32 daddr = ip_hdr(skb)->daddr;
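        /* A non-zero "when" means this SYN was accepted out of a recycled
         * TIME-WAIT bucket, which already supplied an ISN for us (see the
         * comment on "isn" further down).
         */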
1264         __u32 isn = TCP_SKB_CB(skb)->when;
1265         struct dst_entry *dst = NULL;
1266 #ifdef CONFIG_SYN_COOKIES
1267         int want_cookie = 0;
1268 #else
1269 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1270 #endif
1271
1272         /* Never answer SYNs sent to broadcast or multicast addresses */
1273         if (((struct rtable *)skb->dst)->rt_flags &
1274             (RTCF_BROADCAST | RTCF_MULTICAST))
1275                 goto drop;
1276
1277         /* TW buckets are converted to open requests without
1278          * limitations; they conserve resources and the peer is
1279          * evidently a real one.
1280          */
1281         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1282 #ifdef CONFIG_SYN_COOKIES
1283                 if (sysctl_tcp_syncookies) {
1284                         want_cookie = 1;
1285                 } else
1286 #endif
1287                 goto drop;
1288         }
1289
1290         /* Accept backlog is full. If we have already queued enough
1291          * warm entries in the syn queue, drop the request. It is better than
1292          * clogging the syn queue with openreqs with exponentially increasing
1293          * timeouts.
1294          */
1295         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1296                 goto drop;
1297
1298         req = reqsk_alloc(&tcp_request_sock_ops);
1299         if (!req)
1300                 goto drop;
1301
1302 #ifdef CONFIG_TCP_MD5SIG
1303         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1304 #endif
1305
1306         tcp_clear_options(&tmp_opt);
1307         tmp_opt.mss_clamp = 536;
1308         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1309
1310         tcp_parse_options(skb, &tmp_opt, 0);
1311
1312         if (want_cookie) {
1313                 tcp_clear_options(&tmp_opt);
1314                 tmp_opt.saw_tstamp = 0;
1315         }
1316
1317         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1318                 /* Some OSes (unknown ones, but I see them on a web server that
1319                  * contains information interesting only to Windows
1320                  * users) do not send their timestamp in the SYN. It is an easy
1321                  * case: we simply do not advertise TS support.
1322                  */
1323                 tmp_opt.saw_tstamp = 0;
1324                 tmp_opt.tstamp_ok  = 0;
1325         }
1326         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1327
1328         tcp_openreq_init(req, &tmp_opt, skb);
1329
1330         if (security_inet_conn_request(sk, skb, req))
1331                 goto drop_and_free;
1332
1333         ireq = inet_rsk(req);
1334         ireq->loc_addr = daddr;
1335         ireq->rmt_addr = saddr;
1336         ireq->opt = tcp_v4_save_options(sk, skb);
1337         if (!want_cookie)
1338                 TCP_ECN_create_request(req, tcp_hdr(skb));
1339
1340         if (want_cookie) {
1341 #ifdef CONFIG_SYN_COOKIES
1342                 syn_flood_warning(skb);
1343 #endif
1344                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1345         } else if (!isn) {
1346                 struct inet_peer *peer = NULL;
1347
1348                 /* VJ's idea. We save the last timestamp seen
1349                  * from the destination in the peer table when entering
1350                  * state TIME-WAIT, and check against it before
1351                  * accepting a new connection request.
1352                  *
1353                  * If "isn" is not zero, this request hit a live
1354                  * timewait bucket, so all the necessary checks
1355                  * were already made when processing the timewait state.
1356                  */
1357                 if (tmp_opt.saw_tstamp &&
1358                     tcp_death_row.sysctl_tw_recycle &&
1359                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1360                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1361                     peer->v4daddr == saddr) {
1362                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1363                             (s32)(peer->tcp_ts - req->ts_recent) >
1364                                                         TCP_PAWS_WINDOW) {
1365                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1366                                 dst_release(dst);
1367                                 goto drop_and_free;
1368                         }
1369                 }
1370                 /* Kill the following clause, if you dislike this way. */
1371                 else if (!sysctl_tcp_syncookies &&
1372                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1373                           (sysctl_max_syn_backlog >> 2)) &&
1374                          (!peer || !peer->tcp_ts_stamp) &&
1375                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1376                         /* Without syncookies the last quarter of the
1377                          * backlog is filled only with destinations
1378                          * proven to be alive.
1379                          * It means that we continue to communicate
1380                          * with destinations already remembered
1381                          * from before the synflood started.
1382                          */
1383                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1384                                        "request from %u.%u.%u.%u/%u\n",
1385                                        NIPQUAD(saddr),
1386                                        ntohs(tcp_hdr(skb)->source));
1387                         dst_release(dst);
1388                         goto drop_and_free;
1389                 }
1390
1391                 isn = tcp_v4_init_sequence(skb);
1392         }
1393         tcp_rsk(req)->snt_isn = isn;
1394
1395         if (tcp_v4_send_synack(sk, req, dst))
1396                 goto drop_and_free;
1397
1398         if (want_cookie) {
1399                 reqsk_free(req);
1400         } else {
1401                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1402         }
1403         return 0;
1404
1405 drop_and_free:
1406         reqsk_free(req);
1407 drop:
1408         return 0;
1409 }
1410
1411
1412 /*
1413  * The three-way handshake has completed - we got a valid ACK -
1414  * now create the new socket.
1415  */
1416 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1417                                   struct request_sock *req,
1418                                   struct dst_entry *dst)
1419 {
1420         struct inet_request_sock *ireq;
1421         struct inet_sock *newinet;
1422         struct tcp_sock *newtp;
1423         struct sock *newsk;
1424 #ifdef CONFIG_TCP_MD5SIG
1425         struct tcp_md5sig_key *key;
1426 #endif
1427
1428         if (sk_acceptq_is_full(sk))
1429                 goto exit_overflow;
1430
1431         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1432                 goto exit;
1433
1434         newsk = tcp_create_openreq_child(sk, req, skb);
1435         if (!newsk)
1436                 goto exit;
1437
1438         newsk->sk_gso_type = SKB_GSO_TCPV4;
1439         sk_setup_caps(newsk, dst);
1440
1441         newtp                 = tcp_sk(newsk);
1442         newinet               = inet_sk(newsk);
1443         ireq                  = inet_rsk(req);
1444         newinet->daddr        = ireq->rmt_addr;
1445         newinet->rcv_saddr    = ireq->loc_addr;
1446         newinet->saddr        = ireq->loc_addr;
1447         newinet->opt          = ireq->opt;
1448         ireq->opt             = NULL;
1449         newinet->mc_index     = inet_iif(skb);
1450         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1451         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1452         if (newinet->opt)
1453                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1454         newinet->id = newtp->write_seq ^ jiffies;
1455
1456         tcp_mtup_init(newsk);
1457         tcp_sync_mss(newsk, dst_mtu(dst));
1458         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1459         tcp_initialize_rcv_mss(newsk);
1460
1461 #ifdef CONFIG_TCP_MD5SIG
1462         /* Copy over the MD5 key from the original socket */
1463         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1464                 /*
1465                  * We're using one, so create a matching key
1466                  * on the newsk structure. If we fail to get
1467                  * memory, then we end up not copying the key
1468                  * across. Shucks.
1469                  */
1470                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1471                 if (newkey != NULL)
1472                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1473                                           newkey, key->keylen);
1474         }
1475 #endif
1476
1477         __inet_hash(&tcp_hashinfo, newsk, 0);
1478         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1479
1480         return newsk;
1481
1482 exit_overflow:
1483         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1484 exit:
1485         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1486         dst_release(dst);
1487         return NULL;
1488 }
1489
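/* Resolve a segment that arrived on a listening socket: a matching entry
 * in the SYN queue is handed to tcp_check_req(), an already established
 * sibling found in the ehash is returned locked, and (with syncookies
 * enabled) a bare ACK may be validated by cookie_v4_check().  A NULL
 * return tells the caller to discard the segment.
 */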
1490 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1491 {
1492         struct tcphdr *th = tcp_hdr(skb);
1493         const struct iphdr *iph = ip_hdr(skb);
1494         struct sock *nsk;
1495         struct request_sock **prev;
1496         /* Find possible connection requests. */
1497         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1498                                                        iph->saddr, iph->daddr);
1499         if (req)
1500                 return tcp_check_req(sk, skb, req, prev);
1501
1502         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1503                                       iph->daddr, th->dest, inet_iif(skb));
1504
1505         if (nsk) {
1506                 if (nsk->sk_state != TCP_TIME_WAIT) {
1507                         bh_lock_sock(nsk);
1508                         return nsk;
1509                 }
1510                 inet_twsk_put(inet_twsk(nsk));
1511                 return NULL;
1512         }
1513
1514 #ifdef CONFIG_SYN_COOKIES
1515         if (!th->rst && !th->syn && th->ack)
1516                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1517 #endif
1518         return sk;
1519 }
1520
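/* Verify or prime the TCP checksum of an incoming segment.  A hardware
 * CHECKSUM_COMPLETE value is folded against the pseudo-header right away;
 * otherwise the pseudo-header sum is seeded into skb->csum and, except
 * for short packets (<= 76 bytes) which are verified immediately, full
 * verification is deferred to a later checksum-complete pass.
 */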
1521 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1522 {
1523         const struct iphdr *iph = ip_hdr(skb);
1524
1525         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1526                 if (!tcp_v4_check(skb->len, iph->saddr,
1527                                   iph->daddr, skb->csum)) {
1528                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1529                         return 0;
1530                 }
1531         }
1532
1533         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1534                                        skb->len, IPPROTO_TCP, 0);
1535
1536         if (skb->len <= 76) {
1537                 return __skb_checksum_complete(skb);
1538         }
1539         return 0;
1540 }
1541
1542
1543 /* The socket must have its spinlock held when we get
1544  * here.
1545  *
1546  * We have a potential double-lock case here, so even when
1547  * doing backlog processing we use the BH locking scheme.
1548  * This is because we cannot sleep with the original spinlock
1549  * held.
1550  */
1551 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1552 {
1553         struct sock *rsk;
1554 #ifdef CONFIG_TCP_MD5SIG
1555         /*
1556          * We really want to reject the packet as early as possible
1557          * if:
1558          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1559          *  o There is an MD5 option and we're not expecting one
1560          */
1561         if (tcp_v4_inbound_md5_hash(sk, skb))
1562                 goto discard;
1563 #endif
1564
1565         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1566                 TCP_CHECK_TIMER(sk);
1567                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1568                         rsk = sk;
1569                         goto reset;
1570                 }
1571                 TCP_CHECK_TIMER(sk);
1572                 return 0;
1573         }
1574
1575         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1576                 goto csum_err;
1577
1578         if (sk->sk_state == TCP_LISTEN) {
1579                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1580                 if (!nsk)
1581                         goto discard;
1582
1583                 if (nsk != sk) {
1584                         if (tcp_child_process(sk, nsk, skb)) {
1585                                 rsk = nsk;
1586                                 goto reset;
1587                         }
1588                         return 0;
1589                 }
1590         }
1591
1592         TCP_CHECK_TIMER(sk);
1593         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1594                 rsk = sk;
1595                 goto reset;
1596         }
1597         TCP_CHECK_TIMER(sk);
1598         return 0;
1599
1600 reset:
1601         tcp_v4_send_reset(rsk, skb);
1602 discard:
1603         kfree_skb(skb);
1604         /* Be careful here. If this function gets more complicated and
1605          * gcc suffers from register pressure on the x86, sk (in %ebx)
1606          * might be destroyed here. This current version compiles correctly,
1607          * but you have been warned.
1608          */
1609         return 0;
1610
1611 csum_err:
1612         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1613         goto discard;
1614 }
1615
1616 /*
1617  *      From tcp_input.c
1618  */
1619
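/* Entry point for IPv4 TCP segments, called from the IP layer in softirq
 * context.  It validates the header and checksum, fills in the TCP
 * control block, looks the socket up in the established/listening hashes
 * and then either processes the segment directly, prequeues it for the
 * receiving task, or defers it to the socket backlog.
 */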
1620 int tcp_v4_rcv(struct sk_buff *skb)
1621 {
1622         const struct iphdr *iph;
1623         struct tcphdr *th;
1624         struct sock *sk;
1625         int ret;
1626
1627         if (skb->pkt_type != PACKET_HOST)
1628                 goto discard_it;
1629
1630         /* Count it even if it's bad */
1631         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1632
1633         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1634                 goto discard_it;
1635
1636         th = tcp_hdr(skb);
1637
1638         if (th->doff < sizeof(struct tcphdr) / 4)
1639                 goto bad_packet;
1640         if (!pskb_may_pull(skb, th->doff * 4))
1641                 goto discard_it;
1642
1643         /* An explanation is required here, I think.
1644          * Packet length and doff are validated by header prediction,
1645          * provided the case of th->doff == 0 is eliminated.
1646          * So, we defer the checks. */
1647         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1648                 goto bad_packet;
1649
1650         th = tcp_hdr(skb);
1651         iph = ip_hdr(skb);
1652         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1653         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1654                                     skb->len - th->doff * 4);
1655         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1656         TCP_SKB_CB(skb)->when    = 0;
1657         TCP_SKB_CB(skb)->flags   = iph->tos;
1658         TCP_SKB_CB(skb)->sacked  = 0;
1659
1660         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1661                            iph->daddr, th->dest, inet_iif(skb));
1662         if (!sk)
1663                 goto no_tcp_socket;
1664
1665 process:
1666         if (sk->sk_state == TCP_TIME_WAIT)
1667                 goto do_time_wait;
1668
1669         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670                 goto discard_and_relse;
1671         nf_reset(skb);
1672
1673         if (sk_filter(sk, skb))
1674                 goto discard_and_relse;
1675
1676         skb->dev = NULL;
1677
1678         bh_lock_sock_nested(sk);
1679         ret = 0;
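        /* If the socket is not locked by a user context, handle the
         * segment right here (possibly through a NET_DMA copy channel)
         * or let tcp_prequeue() park it for the task sleeping in
         * recvmsg; otherwise queue it on the backlog to be replayed
         * from release_sock().
         */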
1680         if (!sock_owned_by_user(sk)) {
1681 #ifdef CONFIG_NET_DMA
1682                 struct tcp_sock *tp = tcp_sk(sk);
1683                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1684                         tp->ucopy.dma_chan = get_softnet_dma();
1685                 if (tp->ucopy.dma_chan)
1686                         ret = tcp_v4_do_rcv(sk, skb);
1687                 else
1688 #endif
1689                 {
1690                         if (!tcp_prequeue(sk, skb))
1691                                 ret = tcp_v4_do_rcv(sk, skb);
1692                 }
1693         } else
1694                 sk_add_backlog(sk, skb);
1695         bh_unlock_sock(sk);
1696
1697         sock_put(sk);
1698
1699         return ret;
1700
1701 no_tcp_socket:
1702         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1703                 goto discard_it;
1704
1705         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1706 bad_packet:
1707                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1708         } else {
1709                 tcp_v4_send_reset(NULL, skb);
1710         }
1711
1712 discard_it:
1713         /* Discard frame. */
1714         kfree_skb(skb);
1715         return 0;
1716
1717 discard_and_relse:
1718         sock_put(sk);
1719         goto discard_it;
1720
1721 do_time_wait:
1722         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1723                 inet_twsk_put(inet_twsk(sk));
1724                 goto discard_it;
1725         }
1726
1727         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1728                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1729                 inet_twsk_put(inet_twsk(sk));
1730                 goto discard_it;
1731         }
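        /* Let the TIME_WAIT state machine decide what to do: a new SYN
         * may be redirected to a current listener, TCP_TW_ACK is
         * answered with a timewait ACK, TCP_TW_RST is treated like a
         * segment with no socket, and TCP_TW_SUCCESS simply drops the
         * segment.
         */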
1732         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1733         case TCP_TW_SYN: {
1734                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1735                                                         iph->daddr, th->dest,
1736                                                         inet_iif(skb));
1737                 if (sk2) {
1738                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1739                         inet_twsk_put(inet_twsk(sk));
1740                         sk = sk2;
1741                         goto process;
1742                 }
1743                 /* Fall through to ACK */
1744         }
1745         case TCP_TW_ACK:
1746                 tcp_v4_timewait_ack(sk, skb);
1747                 break;
1748         case TCP_TW_RST:
1749                 goto no_tcp_socket;
1750         case TCP_TW_SUCCESS:;
1751         }
1752         goto discard_it;
1753 }
1754
1755 /* VJ's idea. Save the last timestamp seen from this destination
1756  * and hold it for at least the normal timewait interval, to use for
1757  * duplicate segment detection in subsequent connections before they
1758  * enter the synchronized state.
1759  */
1760
1761 int tcp_v4_remember_stamp(struct sock *sk)
1762 {
1763         struct inet_sock *inet = inet_sk(sk);
1764         struct tcp_sock *tp = tcp_sk(sk);
1765         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1766         struct inet_peer *peer = NULL;
1767         int release_it = 0;
1768
1769         if (!rt || rt->rt_dst != inet->daddr) {
1770                 peer = inet_getpeer(inet->daddr, 1);
1771                 release_it = 1;
1772         } else {
1773                 if (!rt->peer)
1774                         rt_bind_peer(rt, 1);
1775                 peer = rt->peer;
1776         }
1777
1778         if (peer) {
1779                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1780                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1781                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1782                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1783                         peer->tcp_ts = tp->rx_opt.ts_recent;
1784                 }
1785                 if (release_it)
1786                         inet_putpeer(peer);
1787                 return 1;
1788         }
1789
1790         return 0;
1791 }
1792
1793 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1794 {
1795         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1796
1797         if (peer) {
1798                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1799
1800                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1801                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1802                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1803                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1804                         peer->tcp_ts       = tcptw->tw_ts_recent;
1805                 }
1806                 inet_putpeer(peer);
1807                 return 1;
1808         }
1809
1810         return 0;
1811 }
1812
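/* IPv4 instances of the address-family specific hooks used by the
 * af-independent connection socket and TCP code.
 */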
1813 struct inet_connection_sock_af_ops ipv4_specific = {
1814         .queue_xmit        = ip_queue_xmit,
1815         .send_check        = tcp_v4_send_check,
1816         .rebuild_header    = inet_sk_rebuild_header,
1817         .conn_request      = tcp_v4_conn_request,
1818         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1819         .remember_stamp    = tcp_v4_remember_stamp,
1820         .net_header_len    = sizeof(struct iphdr),
1821         .setsockopt        = ip_setsockopt,
1822         .getsockopt        = ip_getsockopt,
1823         .addr2sockaddr     = inet_csk_addr2sockaddr,
1824         .sockaddr_len      = sizeof(struct sockaddr_in),
1825 #ifdef CONFIG_COMPAT
1826         .compat_setsockopt = compat_ip_setsockopt,
1827         .compat_getsockopt = compat_ip_getsockopt,
1828 #endif
1829 };
1830
1831 #ifdef CONFIG_TCP_MD5SIG
1832 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1833         .md5_lookup             = tcp_v4_md5_lookup,
1834         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1835         .md5_add                = tcp_v4_md5_add_func,
1836         .md5_parse              = tcp_v4_parse_md5_keys,
1837 };
1838 #endif
1839
1840 /* NOTE: A lot of things are set to zero explicitly by the call to
1841  *       sk_alloc(), so they need not be done here.
1842  */
1843 static int tcp_v4_init_sock(struct sock *sk)
1844 {
1845         struct inet_connection_sock *icsk = inet_csk(sk);
1846         struct tcp_sock *tp = tcp_sk(sk);
1847
1848         skb_queue_head_init(&tp->out_of_order_queue);
1849         tcp_init_xmit_timers(sk);
1850         tcp_prequeue_init(tp);
1851
1852         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1853         tp->mdev = TCP_TIMEOUT_INIT;
1854
1855         /* So many TCP implementations out there (incorrectly) count the
1856          * initial SYN frame in their delayed-ACK and congestion control
1857          * algorithms that we must have the following bandaid to talk
1858          * efficiently to them.  -DaveM
1859          */
1860         tp->snd_cwnd = 2;
1861
1862         /* See draft-stevens-tcpca-spec-01 for discussion of the
1863          * initialization of these values.
1864          */
1865         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1866         tp->snd_cwnd_clamp = ~0;
1867         tp->mss_cache = 536;
1868
1869         tp->reordering = sysctl_tcp_reordering;
1870         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1871
1872         sk->sk_state = TCP_CLOSE;
1873
1874         sk->sk_write_space = sk_stream_write_space;
1875         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1876
1877         icsk->icsk_af_ops = &ipv4_specific;
1878         icsk->icsk_sync_mss = tcp_sync_mss;
1879 #ifdef CONFIG_TCP_MD5SIG
1880         tp->af_specific = &tcp_sock_ipv4_specific;
1881 #endif
1882
1883         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1884         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1885
1886         atomic_inc(&tcp_sockets_allocated);
1887
1888         return 0;
1889 }
1890
1891 int tcp_v4_destroy_sock(struct sock *sk)
1892 {
1893         struct tcp_sock *tp = tcp_sk(sk);
1894
1895         tcp_clear_xmit_timers(sk);
1896
1897         tcp_cleanup_congestion_control(sk);
1898
1899         /* Clean up the write buffer. */
1900         tcp_write_queue_purge(sk);
1901
1902         /* Cleans up our, hopefully empty, out_of_order_queue. */
1903         __skb_queue_purge(&tp->out_of_order_queue);
1904
1905 #ifdef CONFIG_TCP_MD5SIG
1906         /* Clean up the MD5 key list, if any */
1907         if (tp->md5sig_info) {
1908                 tcp_v4_clear_md5_list(sk);
1909                 kfree(tp->md5sig_info);
1910                 tp->md5sig_info = NULL;
1911         }
1912 #endif
1913
1914 #ifdef CONFIG_NET_DMA
1915         /* Cleans up our sk_async_wait_queue */
1916         __skb_queue_purge(&sk->sk_async_wait_queue);
1917 #endif
1918
1919         /* Clean up the prequeue; it really must be empty. */
1920         __skb_queue_purge(&tp->ucopy.prequeue);
1921
1922         /* Clean up a referenced TCP bind bucket. */
1923         if (inet_csk(sk)->icsk_bind_hash)
1924                 inet_put_port(&tcp_hashinfo, sk);
1925
1926         /*
1927          * If a cached sendmsg page exists, toss it.
1928          */
1929         if (sk->sk_sndmsg_page) {
1930                 __free_page(sk->sk_sndmsg_page);
1931                 sk->sk_sndmsg_page = NULL;
1932         }
1933
1934         atomic_dec(&tcp_sockets_allocated);
1935
1936         return 0;
1937 }
1938
1939 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1940
1941 #ifdef CONFIG_PROC_FS
1942 /* Proc filesystem TCP sock list dumping. */
1943
1944 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1945 {
1946         return hlist_empty(head) ? NULL :
1947                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1948 }
1949
1950 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1951 {
1952         return tw->tw_node.next ?
1953                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1954 }
1955
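/* Walk the listening hash for the /proc dump.  For every listening
 * socket of the requested family the iterator also descends into the
 * socket's SYN queue (TCP_SEQ_STATE_OPENREQ), holding syn_wait_lock
 * while it does, before moving on to the next bucket.
 */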
1956 static void *listening_get_next(struct seq_file *seq, void *cur)
1957 {
1958         struct inet_connection_sock *icsk;
1959         struct hlist_node *node;
1960         struct sock *sk = cur;
1961         struct tcp_iter_state* st = seq->private;
1962
1963         if (!sk) {
1964                 st->bucket = 0;
1965                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1966                 goto get_sk;
1967         }
1968
1969         ++st->num;
1970
1971         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1972                 struct request_sock *req = cur;
1973
1974                 icsk = inet_csk(st->syn_wait_sk);
1975                 req = req->dl_next;
1976                 while (1) {
1977                         while (req) {
1978                                 if (req->rsk_ops->family == st->family) {
1979                                         cur = req;
1980                                         goto out;
1981                                 }
1982                                 req = req->dl_next;
1983                         }
1984                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1985                                 break;
1986 get_req:
1987                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1988                 }
1989                 sk        = sk_next(st->syn_wait_sk);
1990                 st->state = TCP_SEQ_STATE_LISTENING;
1991                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992         } else {
1993                 icsk = inet_csk(sk);
1994                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1995                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1996                         goto start_req;
1997                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1998                 sk = sk_next(sk);
1999         }
2000 get_sk:
2001         sk_for_each_from(sk, node) {
2002                 if (sk->sk_family == st->family) {
2003                         cur = sk;
2004                         goto out;
2005                 }
2006                 icsk = inet_csk(sk);
2007                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2009 start_req:
2010                         st->uid         = sock_i_uid(sk);
2011                         st->syn_wait_sk = sk;
2012                         st->state       = TCP_SEQ_STATE_OPENREQ;
2013                         st->sbucket     = 0;
2014                         goto get_req;
2015                 }
2016                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2017         }
2018         if (++st->bucket < INET_LHTABLE_SIZE) {
2019                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2020                 goto get_sk;
2021         }
2022         cur = NULL;
2023 out:
2024         return cur;
2025 }
2026
2027 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2028 {
2029         void *rc = listening_get_next(seq, NULL);
2030
2031         while (rc && *pos) {
2032                 rc = listening_get_next(seq, rc);
2033                 --*pos;
2034         }
2035         return rc;
2036 }
2037
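/* Find the first established or TIME_WAIT socket of the right family,
 * taking the per-bucket ehash read lock that the iterator then keeps
 * holding until it leaves the bucket.
 */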
2038 static void *established_get_first(struct seq_file *seq)
2039 {
2040         struct tcp_iter_state* st = seq->private;
2041         void *rc = NULL;
2042
2043         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2044                 struct sock *sk;
2045                 struct hlist_node *node;
2046                 struct inet_timewait_sock *tw;
2047
2048                 /* We can reschedule _before_ having picked the target: */
2049                 cond_resched_softirq();
2050
2051                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2052                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2053                         if (sk->sk_family != st->family) {
2054                                 continue;
2055                         }
2056                         rc = sk;
2057                         goto out;
2058                 }
2059                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2060                 inet_twsk_for_each(tw, node,
2061                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2062                         if (tw->tw_family != st->family) {
2063                                 continue;
2064                         }
2065                         rc = tw;
2066                         goto out;
2067                 }
2068                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2069                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2070         }
2071 out:
2072         return rc;
2073 }
2074
2075 static void *established_get_next(struct seq_file *seq, void *cur)
2076 {
2077         struct sock *sk = cur;
2078         struct inet_timewait_sock *tw;
2079         struct hlist_node *node;
2080         struct tcp_iter_state* st = seq->private;
2081
2082         ++st->num;
2083
2084         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2085                 tw = cur;
2086                 tw = tw_next(tw);
2087 get_tw:
2088                 while (tw && tw->tw_family != st->family) {
2089                         tw = tw_next(tw);
2090                 }
2091                 if (tw) {
2092                         cur = tw;
2093                         goto out;
2094                 }
2095                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2096                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2097
2098                 /* We can reschedule between buckets: */
2099                 cond_resched_softirq();
2100
2101                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2102                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2103                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2104                 } else {
2105                         cur = NULL;
2106                         goto out;
2107                 }
2108         } else
2109                 sk = sk_next(sk);
2110
2111         sk_for_each_from(sk, node) {
2112                 if (sk->sk_family == st->family)
2113                         goto found;
2114         }
2115
2116         st->state = TCP_SEQ_STATE_TIME_WAIT;
2117         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2118         goto get_tw;
2119 found:
2120         cur = sk;
2121 out:
2122         return cur;
2123 }
2124
2125 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2126 {
2127         void *rc = established_get_first(seq);
2128
2129         while (rc && pos) {
2130                 rc = established_get_next(seq, rc);
2131                 --pos;
2132         }
2133         return rc;
2134 }
2135
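/* Position the iterator at the pos'th entry.  The listening hash lock is
 * taken first; only when the listening walk is exhausted is it dropped
 * and the established hash walked instead, with bottom halves disabled.
 */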
2136 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2137 {
2138         void *rc;
2139         struct tcp_iter_state* st = seq->private;
2140
2141         inet_listen_lock(&tcp_hashinfo);
2142         st->state = TCP_SEQ_STATE_LISTENING;
2143         rc        = listening_get_idx(seq, &pos);
2144
2145         if (!rc) {
2146                 inet_listen_unlock(&tcp_hashinfo);
2147                 local_bh_disable();
2148                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2149                 rc        = established_get_idx(seq, pos);
2150         }
2151
2152         return rc;
2153 }
2154
2155 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2156 {
2157         struct tcp_iter_state* st = seq->private;
2158         st->state = TCP_SEQ_STATE_LISTENING;
2159         st->num = 0;
2160         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2161 }
2162
2163 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2164 {
2165         void *rc = NULL;
2166         struct tcp_iter_state* st;
2167
2168         if (v == SEQ_START_TOKEN) {
2169                 rc = tcp_get_idx(seq, 0);
2170                 goto out;
2171         }
2172         st = seq->private;
2173
2174         switch (st->state) {
2175         case TCP_SEQ_STATE_OPENREQ:
2176         case TCP_SEQ_STATE_LISTENING:
2177                 rc = listening_get_next(seq, v);
2178                 if (!rc) {
2179                         inet_listen_unlock(&tcp_hashinfo);
2180                         local_bh_disable();
2181                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2182                         rc        = established_get_first(seq);
2183                 }
2184                 break;
2185         case TCP_SEQ_STATE_ESTABLISHED:
2186         case TCP_SEQ_STATE_TIME_WAIT:
2187                 rc = established_get_next(seq, v);
2188                 break;
2189         }
2190 out:
2191         ++*pos;
2192         return rc;
2193 }
2194
2195 static void tcp_seq_stop(struct seq_file *seq, void *v)
2196 {
2197         struct tcp_iter_state* st = seq->private;
2198
2199         switch (st->state) {
2200         case TCP_SEQ_STATE_OPENREQ:
2201                 if (v) {
2202                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2203                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2204                 }
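                /* fall through: drop the listening lock below */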
2205         case TCP_SEQ_STATE_LISTENING:
2206                 if (v != SEQ_START_TOKEN)
2207                         inet_listen_unlock(&tcp_hashinfo);
2208                 break;
2209         case TCP_SEQ_STATE_TIME_WAIT:
2210         case TCP_SEQ_STATE_ESTABLISHED:
2211                 if (v)
2212                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2213                 local_bh_enable();
2214                 break;
2215         }
2216 }
2217
2218 static int tcp_seq_open(struct inode *inode, struct file *file)
2219 {
2220         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2221         struct seq_file *seq;
2222         struct tcp_iter_state *s;
2223         int rc;
2224
2225         if (unlikely(afinfo == NULL))
2226                 return -EINVAL;
2227
2228         s = kzalloc(sizeof(*s), GFP_KERNEL);
2229         if (!s)
2230                 return -ENOMEM;
2231         s->family               = afinfo->family;
2232         s->seq_ops.start        = tcp_seq_start;
2233         s->seq_ops.next         = tcp_seq_next;
2234         s->seq_ops.show         = afinfo->seq_show;
2235         s->seq_ops.stop         = tcp_seq_stop;
2236
2237         rc = seq_open(file, &s->seq_ops);
2238         if (rc)
2239                 goto out_kfree;
2240         seq          = file->private_data;
2241         seq->private = s;
2242 out:
2243         return rc;
2244 out_kfree:
2245         kfree(s);
2246         goto out;
2247 }
2248
2249 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2250 {
2251         int rc = 0;
2252         struct proc_dir_entry *p;
2253
2254         if (!afinfo)
2255                 return -EINVAL;
2256         afinfo->seq_fops->owner         = afinfo->owner;
2257         afinfo->seq_fops->open          = tcp_seq_open;
2258         afinfo->seq_fops->read          = seq_read;
2259         afinfo->seq_fops->llseek        = seq_lseek;
2260         afinfo->seq_fops->release       = seq_release_private;
2261
2262         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2263         if (p)
2264                 p->data = afinfo;
2265         else
2266                 rc = -ENOMEM;
2267         return rc;
2268 }
2269
2270 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2271 {
2272         if (!afinfo)
2273                 return;
2274         proc_net_remove(afinfo->name);
2275         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2276 }
2277
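/* Formatting helpers for the /proc/net/tcp dump.  Each open request,
 * socket or timewait entry becomes one line in the column layout
 * announced by tcp4_seq_show().  Addresses are printed as the raw %08X
 * of the on-the-wire __be32 (so, on a little-endian machine, 127.0.0.1
 * shows up as 0100007F) and ports in host byte order; a listener on
 * port 80 would thus look roughly like
 *   "   0: 0100007F:0050 00000000:0000 0A ..."
 */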
2278 static void get_openreq4(struct sock *sk, struct request_sock *req,
2279                          char *tmpbuf, int i, int uid)
2280 {
2281         const struct inet_request_sock *ireq = inet_rsk(req);
2282         int ttd = req->expires - jiffies;
2283
2284         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2285                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2286                 i,
2287                 ireq->loc_addr,
2288                 ntohs(inet_sk(sk)->sport),
2289                 ireq->rmt_addr,
2290                 ntohs(ireq->rmt_port),
2291                 TCP_SYN_RECV,
2292                 0, 0, /* could print option size, but that is af dependent. */
2293                 1,    /* timers active (only the expire timer) */
2294                 jiffies_to_clock_t(ttd),
2295                 req->retrans,
2296                 uid,
2297                 0,  /* non standard timer */
2298                 0, /* open_requests have no inode */
2299                 atomic_read(&sk->sk_refcnt),
2300                 req);
2301 }
2302
2303 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2304 {
2305         int timer_active;
2306         unsigned long timer_expires;
2307         struct tcp_sock *tp = tcp_sk(sk);
2308         const struct inet_connection_sock *icsk = inet_csk(sk);
2309         struct inet_sock *inet = inet_sk(sk);
2310         __be32 dest = inet->daddr;
2311         __be32 src = inet->rcv_saddr;
2312         __u16 destp = ntohs(inet->dport);
2313         __u16 srcp = ntohs(inet->sport);
2314
2315         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2316                 timer_active    = 1;
2317                 timer_expires   = icsk->icsk_timeout;
2318         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2319                 timer_active    = 4;
2320                 timer_expires   = icsk->icsk_timeout;
2321         } else if (timer_pending(&sk->sk_timer)) {
2322                 timer_active    = 2;
2323                 timer_expires   = sk->sk_timer.expires;
2324         } else {
2325                 timer_active    = 0;
2326                 timer_expires = jiffies;
2327         }
2328
2329         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2330                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2331                 i, src, srcp, dest, destp, sk->sk_state,
2332                 tp->write_seq - tp->snd_una,
2333                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2334                                              (tp->rcv_nxt - tp->copied_seq),
2335                 timer_active,
2336                 jiffies_to_clock_t(timer_expires - jiffies),
2337                 icsk->icsk_retransmits,
2338                 sock_i_uid(sk),
2339                 icsk->icsk_probes_out,
2340                 sock_i_ino(sk),
2341                 atomic_read(&sk->sk_refcnt), sk,
2342                 icsk->icsk_rto,
2343                 icsk->icsk_ack.ato,
2344                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2345                 tp->snd_cwnd,
2346                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2347 }
2348
2349 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2350                                char *tmpbuf, int i)
2351 {
2352         __be32 dest, src;
2353         __u16 destp, srcp;
2354         int ttd = tw->tw_ttd - jiffies;
2355
2356         if (ttd < 0)
2357                 ttd = 0;
2358
2359         dest  = tw->tw_daddr;
2360         src   = tw->tw_rcv_saddr;
2361         destp = ntohs(tw->tw_dport);
2362         srcp  = ntohs(tw->tw_sport);
2363
2364         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2365                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2366                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2367                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2368                 atomic_read(&tw->tw_refcnt), tw);
2369 }
2370
2371 #define TMPSZ 150
2372
2373 static int tcp4_seq_show(struct seq_file *seq, void *v)
2374 {
2375         struct tcp_iter_state* st;
2376         char tmpbuf[TMPSZ + 1];
2377
2378         if (v == SEQ_START_TOKEN) {
2379                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2380                            "  sl  local_address rem_address   st tx_queue "
2381                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2382                            "inode");
2383                 goto out;
2384         }
2385         st = seq->private;
2386
2387         switch (st->state) {
2388         case TCP_SEQ_STATE_LISTENING:
2389         case TCP_SEQ_STATE_ESTABLISHED:
2390                 get_tcp4_sock(v, tmpbuf, st->num);
2391                 break;
2392         case TCP_SEQ_STATE_OPENREQ:
2393                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2394                 break;
2395         case TCP_SEQ_STATE_TIME_WAIT:
2396                 get_timewait4_sock(v, tmpbuf, st->num);
2397                 break;
2398         }
2399         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2400 out:
2401         return 0;
2402 }
2403
2404 static struct file_operations tcp4_seq_fops;
2405 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2406         .owner          = THIS_MODULE,
2407         .name           = "tcp",
2408         .family         = AF_INET,
2409         .seq_show       = tcp4_seq_show,
2410         .seq_fops       = &tcp4_seq_fops,
2411 };
2412
2413 int __init tcp4_proc_init(void)
2414 {
2415         return tcp_proc_register(&tcp4_seq_afinfo);
2416 }
2417
2418 void tcp4_proc_exit(void)
2419 {
2420         tcp_proc_unregister(&tcp4_seq_afinfo);
2421 }
2422 #endif /* CONFIG_PROC_FS */
2423
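/* Protocol descriptor for SOCK_STREAM/IPPROTO_TCP sockets; these are the
 * entry points the generic socket layer calls into for IPv4 TCP.
 */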
2424 struct proto tcp_prot = {
2425         .name                   = "TCP",
2426         .owner                  = THIS_MODULE,
2427         .close                  = tcp_close,
2428         .connect                = tcp_v4_connect,
2429         .disconnect             = tcp_disconnect,
2430         .accept                 = inet_csk_accept,
2431         .ioctl                  = tcp_ioctl,
2432         .init                   = tcp_v4_init_sock,
2433         .destroy                = tcp_v4_destroy_sock,
2434         .shutdown               = tcp_shutdown,
2435         .setsockopt             = tcp_setsockopt,
2436         .getsockopt             = tcp_getsockopt,
2437         .sendmsg                = tcp_sendmsg,
2438         .recvmsg                = tcp_recvmsg,
2439         .backlog_rcv            = tcp_v4_do_rcv,
2440         .hash                   = tcp_v4_hash,
2441         .unhash                 = tcp_unhash,
2442         .get_port               = tcp_v4_get_port,
2443         .enter_memory_pressure  = tcp_enter_memory_pressure,
2444         .sockets_allocated      = &tcp_sockets_allocated,
2445         .orphan_count           = &tcp_orphan_count,
2446         .memory_allocated       = &tcp_memory_allocated,
2447         .memory_pressure        = &tcp_memory_pressure,
2448         .sysctl_mem             = sysctl_tcp_mem,
2449         .sysctl_wmem            = sysctl_tcp_wmem,
2450         .sysctl_rmem            = sysctl_tcp_rmem,
2451         .max_header             = MAX_TCP_HEADER,
2452         .obj_size               = sizeof(struct tcp_sock),
2453         .twsk_prot              = &tcp_timewait_sock_ops,
2454         .rsk_prot               = &tcp_request_sock_ops,
2455 #ifdef CONFIG_COMPAT
2456         .compat_setsockopt      = compat_tcp_setsockopt,
2457         .compat_getsockopt      = compat_tcp_getsockopt,
2458 #endif
2459 };
2460
2461 void __init tcp_v4_init(struct net_proto_family *ops)
2462 {
2463         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2464                                      IPPROTO_TCP) < 0)
2465                 panic("Failed to create the TCP control socket.\n");
2466 }
2467
2468 EXPORT_SYMBOL(ipv4_specific);
2469 EXPORT_SYMBOL(tcp_hashinfo);
2470 EXPORT_SYMBOL(tcp_prot);
2471 EXPORT_SYMBOL(tcp_unhash);
2472 EXPORT_SYMBOL(tcp_v4_conn_request);
2473 EXPORT_SYMBOL(tcp_v4_connect);
2474 EXPORT_SYMBOL(tcp_v4_do_rcv);
2475 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2476 EXPORT_SYMBOL(tcp_v4_send_check);
2477 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2478
2479 #ifdef CONFIG_PROC_FS
2480 EXPORT_SYMBOL(tcp_proc_register);
2481 EXPORT_SYMBOL(tcp_proc_unregister);
2482 #endif
2483 EXPORT_SYMBOL(sysctl_local_port_range);
2484 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2485