Merge branch 'linus' into sched-devel
[linux-2.6] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
/*
 * Consulted by tcp_twsk_unique(): when a non-NULL twp is passed, a TIME-WAIT
 * bucket is only taken over if this is non-zero and the bucket's last
 * timestamp is more than one second old.
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* NOTE(review): no reader is visible in this part of the file - confirm users. */
int sysctl_tcp_low_latency __read_mostly;
87
88 /* Check TCP sequence numbers in ICMP packets. */
89 #define ICMP_MIN_LENGTH 8
90
91 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
95                                                    __be32 addr);
96 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
97                                    __be32 saddr, __be32 daddr,
98                                    struct tcphdr *th, int protocol,
99                                    unsigned int tcplen);
100 #endif
101
/*
 * Hash tables used for TCP socket lookup (passed to inet_lookup() from
 * tcp_v4_err()).  Only the listening-hash lock, its user count and its wait
 * queue are initialised here; every other member starts out zeroed as for
 * any object with static storage.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
        .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
        .lhash_users = ATOMIC_INIT(0),
        .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
107
108 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
109 {
110         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
111                                           ip_hdr(skb)->saddr,
112                                           tcp_hdr(skb)->dest,
113                                           tcp_hdr(skb)->source);
114 }
115
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
119         struct tcp_sock *tp = tcp_sk(sk);
120
121         /* With PAWS, it is safe from the viewpoint
122            of data integrity. Even without PAWS it is safe provided sequence
123            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
124
125            Actually, the idea is close to VJ's one, only timestamp cache is
126            held not per host, but per port pair and TW bucket is used as state
127            holder.
128
129            If TW bucket has been already destroyed we fall back to VJ's scheme
130            and use initial timestamp retrieved from peer table.
131          */
132         if (tcptw->tw_ts_recent_stamp &&
133             (twp == NULL || (sysctl_tcp_tw_reuse &&
134                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
135                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
136                 if (tp->write_seq == 0)
137                         tp->write_seq = 1;
138                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
139                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
140                 sock_hold(sktw);
141                 return 1;
142         }
143
144         return 0;
145 }
146
147 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
148
/*
 * This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: size of @uaddr in bytes
 *
 * Returns 0 on success.  Returns -EINVAL for a too-short address (or a zero
 * destination with a source route), -EAFNOSUPPORT for a non-AF_INET address,
 * -ENETUNREACH for multicast/broadcast routes, or whatever the failing
 * routing/hashing/connect step returned.  Failures after the socket entered
 * SYN-SENT go through the "failure" label, which puts the socket back to
 * TCP_CLOSE and clears the route capabilities and destination port.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        /*
         * With a source-route option on the socket the route is looked up
         * towards the option's faddr rather than the final destination.
         */
        nexthop = daddr = usin->sin_addr.s_addr;
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        /* TCP cannot connect to a multicast or broadcast destination. */
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        /* Without source routing, use the destination the route resolved to. */
        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        /* An unbound socket takes its source address from the route. */
        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                /*
                 * VJ's idea. We save last timestamp seen from
                 * the destination in peer table, when entering state
                 * TIME-WAIT and initialize rx_opt.ts_recent from it,
                 * when trying new connection.
                 */
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        /* Account for the length of any IP options on this socket. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        tp->rx_opt.mss_clamp = 536;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        /* Redo the route lookup with the ports that are now in inet. */
        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        /* write_seq may already have been set by tcp_twsk_unique(). */
        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;

        err = tcp_connect(sk);
        /*
         * Clear rt so the failure path's ip_rt_put() below gets NULL.
         * NOTE(review): presumably sk_setup_caps() handed the route over
         * to the socket - confirm.
         */
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}
272
273 /*
274  * This routine does path mtu discovery as defined in RFC1191.
275  */
276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
277 {
278         struct dst_entry *dst;
279         struct inet_sock *inet = inet_sk(sk);
280
281         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
282          * send out by Linux are always <576bytes so they should go through
283          * unfragmented).
284          */
285         if (sk->sk_state == TCP_LISTEN)
286                 return;
287
288         /* We don't check in the destentry if pmtu discovery is forbidden
289          * on this route. We just assume that no packet_to_big packets
290          * are send back when pmtu discovery is not active.
291          * There is a small race when the user changes this flag in the
292          * route, but I think that's acceptable.
293          */
294         if ((dst = __sk_dst_check(sk, 0)) == NULL)
295                 return;
296
297         dst->ops->update_pmtu(dst, mtu);
298
299         /* Something is about to be wrong... Remember soft error
300          * for the case, if this connection will not able to recover.
301          */
302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303                 sk->sk_err_soft = EMSGSIZE;
304
305         mtu = dst_mtu(dst);
306
307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309                 tcp_sync_mss(sk, mtu);
310
311                 /* Resend the TCP packet because it's
312                  * clear that the old packet has been
313                  * dropped. This is the new "fast" path mtu
314                  * discovery.
315                  */
316                 tcp_simple_retransmit(sk);
317         } /* else let the usual retransmit timer handle it */
318 }
319
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data is read as the quoted IP header, followed
 *        by the start of the quoted TCP header
 * @info: extra value from the ICMP layer; only used as the MTU handed to
 *        do_pmtu_discovery() for ICMP_FRAG_NEEDED
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
        struct iphdr *iph = (struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        struct inet_sock *inet;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        __u32 seq;
        int err;

        /* Need the quoted IP header plus 8 bytes of the TCP header. */
        if (skb->len < (iph->ihl << 2) + 8) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }

        /*
         * The quoted packet is one we sent, so its destination is passed as
         * the first address/port pair and its source as the second.
         */
        sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
                        iph->saddr, th->source, inet_iif(skb));
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
        }
        /* TIME-WAIT sockets ignore the error; just drop the lookup reference. */
        if (sk->sk_state == TCP_TIME_WAIT) {
                inet_twsk_put(inet_twsk(sk));
                return;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        /*
         * Except for listeners, the quoted sequence number must lie within
         * [snd_una, snd_nxt]; otherwise the error is counted and ignored.
         */
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, tp->snd_una, tp->snd_nxt)) {
                NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        /* Map the ICMP type/code to an errno value, or bail out. */
        switch (type) {
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        if (!sock_owned_by_user(sk))
                                do_pmtu_discovery(sk, iph, info);
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        /* States handled here end in "goto out"; all others fall through. */
        switch (sk->sk_state) {
                struct request_sock *req, **prev;
        case TCP_LISTEN:
                if (sock_owned_by_user(sk))
                        goto out;

                req = inet_csk_search_req(sk, &prev, th->dest,
                                          iph->daddr, iph->saddr);
                if (!req)
                        goto out;

                /* ICMPs are not backlogged, hence we cannot get
                   an established socket here.
                 */
                BUG_TRAP(!req->sk);

                /* The quoted sequence number must be the ISN of our SYN-ACK. */
                if (seq != tcp_rsk(req)->snt_isn) {
                        NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
                        goto out;
                }

                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(sk, req, prev);
                goto out;

        case TCP_SYN_SENT:
        case TCP_SYN_RECV:  /* Cannot happen.
                               It can f.e. if SYNs crossed.
                             */
                /*
                 * Unowned socket: report the error and finish the connection
                 * attempt.  Otherwise only record it as a soft error.
                 */
                if (!sock_owned_by_user(sk)) {
                        sk->sk_err = err;

                        sk->sk_error_report(sk);

                        tcp_done(sk);
                } else {
                        sk->sk_err_soft = err;
                }
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                      --ANK (980905)
         */

        inet = inet_sk(sk);
        if (!sock_owned_by_user(sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
                sk->sk_err_soft = err;
        }

out:
        /* Release the lock and the reference taken by inet_lookup(). */
        bh_unlock_sock(sk);
        sock_put(sk);
}
482
483 /* This routine computes an IPv4 TCP checksum. */
484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
485 {
486         struct inet_sock *inet = inet_sk(sk);
487         struct tcphdr *th = tcp_hdr(skb);
488
489         if (skb->ip_summed == CHECKSUM_PARTIAL) {
490                 th->check = ~tcp_v4_check(len, inet->saddr,
491                                           inet->daddr, 0);
492                 skb->csum_start = skb_transport_header(skb) - skb->head;
493                 skb->csum_offset = offsetof(struct tcphdr, check);
494         } else {
495                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
496                                          csum_partial((char *)th,
497                                                       th->doff << 2,
498                                                       skb->csum));
499         }
500 }
501
502 int tcp_v4_gso_send_check(struct sk_buff *skb)
503 {
504         const struct iphdr *iph;
505         struct tcphdr *th;
506
507         if (!pskb_may_pull(skb, sizeof(*th)))
508                 return -EINVAL;
509
510         iph = ip_hdr(skb);
511         th = tcp_hdr(skb);
512
513         th->check = 0;
514         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
515         skb->csum_start = skb_transport_header(skb) - skb->head;
516         skb->csum_offset = offsetof(struct tcphdr, check);
517         skb->ip_summed = CHECKSUM_PARTIAL;
518         return 0;
519 }
520
521 /*
522  *      This routine will send an RST to the other tcp.
523  *
524  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
525  *                    for reset.
526  *      Answer: if a packet caused RST, it is not for a socket
527  *              existing in our system, if it is matched to a socket,
528  *              it is just duplicate segment or bug in other side's TCP.
529  *              So that we build reply only basing on parameters
530  *              arrived with segment.
531  *      Exception: precedence violation. We do not implement it in any case.
532  */
533
534 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
535 {
536         struct tcphdr *th = tcp_hdr(skb);
537         struct {
538                 struct tcphdr th;
539 #ifdef CONFIG_TCP_MD5SIG
540                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
541 #endif
542         } rep;
543         struct ip_reply_arg arg;
544 #ifdef CONFIG_TCP_MD5SIG
545         struct tcp_md5sig_key *key;
546 #endif
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
586                                         key,
587                                         ip_hdr(skb)->daddr,
588                                         ip_hdr(skb)->saddr,
589                                         &rep.th, IPPROTO_TCP,
590                                         arg.iov[0].iov_len);
591         }
592 #endif
593         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
594                                       ip_hdr(skb)->saddr, /* XXX */
595                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
596         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
597
598         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
599                       &arg, arg.iov[0].iov_len);
600
601         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
602         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
603 }
604
605 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
606    outside socket context is ugly, certainly. What can I do?
607  */
608
609 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
610                             struct sk_buff *skb, u32 seq, u32 ack,
611                             u32 win, u32 ts)
612 {
613         struct tcphdr *th = tcp_hdr(skb);
614         struct {
615                 struct tcphdr th;
616                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
617 #ifdef CONFIG_TCP_MD5SIG
618                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
619 #endif
620                         ];
621         } rep;
622         struct ip_reply_arg arg;
623 #ifdef CONFIG_TCP_MD5SIG
624         struct tcp_md5sig_key *key;
625         struct tcp_md5sig_key tw_key;
626 #endif
627
628         memset(&rep.th, 0, sizeof(struct tcphdr));
629         memset(&arg, 0, sizeof(arg));
630
631         arg.iov[0].iov_base = (unsigned char *)&rep;
632         arg.iov[0].iov_len  = sizeof(rep.th);
633         if (ts) {
634                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635                                    (TCPOPT_TIMESTAMP << 8) |
636                                    TCPOLEN_TIMESTAMP);
637                 rep.opt[1] = htonl(tcp_time_stamp);
638                 rep.opt[2] = htonl(ts);
639                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
640         }
641
642         /* Swap the send and the receive. */
643         rep.th.dest    = th->source;
644         rep.th.source  = th->dest;
645         rep.th.doff    = arg.iov[0].iov_len / 4;
646         rep.th.seq     = htonl(seq);
647         rep.th.ack_seq = htonl(ack);
648         rep.th.ack     = 1;
649         rep.th.window  = htons(win);
650
651 #ifdef CONFIG_TCP_MD5SIG
652         /*
653          * The SKB holds an imcoming packet, but may not have a valid ->sk
654          * pointer. This is especially the case when we're dealing with a
655          * TIME_WAIT ack, because the sk structure is long gone, and only
656          * the tcp_timewait_sock remains. So the md5 key is stashed in that
657          * structure, and we use it in preference.  I believe that (twsk ||
658          * skb->sk) holds true, but we program defensively.
659          */
660         if (!twsk && skb->sk) {
661                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
662         } else if (twsk && twsk->tw_md5_keylen) {
663                 tw_key.key = twsk->tw_md5_key;
664                 tw_key.keylen = twsk->tw_md5_keylen;
665                 key = &tw_key;
666         } else
667                 key = NULL;
668
669         if (key) {
670                 int offset = (ts) ? 3 : 0;
671
672                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
673                                           (TCPOPT_NOP << 16) |
674                                           (TCPOPT_MD5SIG << 8) |
675                                           TCPOLEN_MD5SIG);
676                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
677                 rep.th.doff = arg.iov[0].iov_len/4;
678
679                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
680                                         key,
681                                         ip_hdr(skb)->daddr,
682                                         ip_hdr(skb)->saddr,
683                                         &rep.th, IPPROTO_TCP,
684                                         arg.iov[0].iov_len);
685         }
686 #endif
687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
688                                       ip_hdr(skb)->saddr, /* XXX */
689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
691         if (twsk)
692                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
693
694         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
695                       &arg, arg.iov[0].iov_len);
696
697         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
698 }
699
700 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
701 {
702         struct inet_timewait_sock *tw = inet_twsk(sk);
703         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
704
705         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
706                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
707                         tcptw->tw_ts_recent);
708
709         inet_twsk_put(tw);
710 }
711
712 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
713                                   struct request_sock *req)
714 {
715         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
716                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
717                         req->ts_recent);
718 }
719
720 /*
721  *      Send a SYN-ACK after having received a SYN.
722  *      This still operates on a request_sock only, not on a big
723  *      socket.
724  */
725 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
726                                 struct dst_entry *dst)
727 {
728         const struct inet_request_sock *ireq = inet_rsk(req);
729         int err = -1;
730         struct sk_buff * skb;
731
732         /* First, grab a route. */
733         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
734                 return -1;
735
736         skb = tcp_make_synack(sk, dst, req);
737
738         if (skb) {
739                 struct tcphdr *th = tcp_hdr(skb);
740
741                 th->check = tcp_v4_check(skb->len,
742                                          ireq->loc_addr,
743                                          ireq->rmt_addr,
744                                          csum_partial((char *)th, skb->len,
745                                                       skb->csum));
746
747                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
748                                             ireq->rmt_addr,
749                                             ireq->opt);
750                 err = net_xmit_eval(err);
751         }
752
753         dst_release(dst);
754         return err;
755 }
756
757 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
758 {
759         return __tcp_v4_send_synack(sk, req, NULL);
760 }
761
762 /*
763  *      IPv4 request_sock destructor.
764  */
765 static void tcp_v4_reqsk_destructor(struct request_sock *req)
766 {
767         kfree(inet_rsk(req)->opt);
768 }
769
770 #ifdef CONFIG_SYN_COOKIES
771 static void syn_flood_warning(struct sk_buff *skb)
772 {
773         static unsigned long warntime;
774
775         if (time_after(jiffies, (warntime + HZ * 60))) {
776                 warntime = jiffies;
777                 printk(KERN_INFO
778                        "possible SYN flooding on port %d. Sending cookies.\n",
779                        ntohs(tcp_hdr(skb)->dest));
780         }
781 }
782 #endif
783
784 /*
785  * Save and compile IPv4 options into the request_sock if needed.
786  */
787 static struct ip_options *tcp_v4_save_options(struct sock *sk,
788                                               struct sk_buff *skb)
789 {
790         struct ip_options *opt = &(IPCB(skb)->opt);
791         struct ip_options *dopt = NULL;
792
793         if (opt && opt->optlen) {
794                 int opt_size = optlength(opt);
795                 dopt = kmalloc(opt_size, GFP_ATOMIC);
796                 if (dopt) {
797                         if (ip_options_echo(dopt, skb)) {
798                                 kfree(dopt);
799                                 dopt = NULL;
800                         }
801                 }
802         }
803         return dopt;
804 }
805
806 #ifdef CONFIG_TCP_MD5SIG
807 /*
808  * RFC2385 MD5 checksumming requires a mapping of
809  * IP address->MD5 Key.
810  * We need to maintain these in the sk structure.
811  */
812
813 /* Find the Key structure for an address.  */
814 static struct tcp_md5sig_key *
815                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
816 {
817         struct tcp_sock *tp = tcp_sk(sk);
818         int i;
819
820         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
821                 return NULL;
822         for (i = 0; i < tp->md5sig_info->entries4; i++) {
823                 if (tp->md5sig_info->keys4[i].addr == addr)
824                         return &tp->md5sig_info->keys4[i].base;
825         }
826         return NULL;
827 }
828
829 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
830                                          struct sock *addr_sk)
831 {
832         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
833 }
834
835 EXPORT_SYMBOL(tcp_v4_md5_lookup);
836
837 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
838                                                       struct request_sock *req)
839 {
840         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
841 }
842
843 /* This can be called on a newly created socket, from other files */
844 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
845                       u8 *newkey, u8 newkeylen)
846 {
847         /* Add Key to the list */
848         struct tcp_md5sig_key *key;
849         struct tcp_sock *tp = tcp_sk(sk);
850         struct tcp4_md5sig_key *keys;
851
852         key = tcp_v4_md5_do_lookup(sk, addr);
853         if (key) {
854                 /* Pre-existing entry - just update that one. */
855                 kfree(key->key);
856                 key->key = newkey;
857                 key->keylen = newkeylen;
858         } else {
859                 struct tcp_md5sig_info *md5sig;
860
861                 if (!tp->md5sig_info) {
862                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
863                                                   GFP_ATOMIC);
864                         if (!tp->md5sig_info) {
865                                 kfree(newkey);
866                                 return -ENOMEM;
867                         }
868                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
869                 }
870                 if (tcp_alloc_md5sig_pool() == NULL) {
871                         kfree(newkey);
872                         return -ENOMEM;
873                 }
874                 md5sig = tp->md5sig_info;
875
876                 if (md5sig->alloced4 == md5sig->entries4) {
877                         keys = kmalloc((sizeof(*keys) *
878                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
879                         if (!keys) {
880                                 kfree(newkey);
881                                 tcp_free_md5sig_pool();
882                                 return -ENOMEM;
883                         }
884
885                         if (md5sig->entries4)
886                                 memcpy(keys, md5sig->keys4,
887                                        sizeof(*keys) * md5sig->entries4);
888
889                         /* Free old key list, and reference new one */
890                         kfree(md5sig->keys4);
891                         md5sig->keys4 = keys;
892                         md5sig->alloced4++;
893                 }
894                 md5sig->entries4++;
895                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
896                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
897                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
898         }
899         return 0;
900 }
901
902 EXPORT_SYMBOL(tcp_v4_md5_do_add);
903
904 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
905                                u8 *newkey, u8 newkeylen)
906 {
907         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
908                                  newkey, newkeylen);
909 }
910
911 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
912 {
913         struct tcp_sock *tp = tcp_sk(sk);
914         int i;
915
916         for (i = 0; i < tp->md5sig_info->entries4; i++) {
917                 if (tp->md5sig_info->keys4[i].addr == addr) {
918                         /* Free the key */
919                         kfree(tp->md5sig_info->keys4[i].base.key);
920                         tp->md5sig_info->entries4--;
921
922                         if (tp->md5sig_info->entries4 == 0) {
923                                 kfree(tp->md5sig_info->keys4);
924                                 tp->md5sig_info->keys4 = NULL;
925                                 tp->md5sig_info->alloced4 = 0;
926                         } else if (tp->md5sig_info->entries4 != i) {
927                                 /* Need to do some manipulation */
928                                 memmove(&tp->md5sig_info->keys4[i],
929                                         &tp->md5sig_info->keys4[i+1],
930                                         (tp->md5sig_info->entries4 - i) *
931                                          sizeof(struct tcp4_md5sig_key));
932                         }
933                         tcp_free_md5sig_pool();
934                         return 0;
935                 }
936         }
937         return -ENOENT;
938 }
939
940 EXPORT_SYMBOL(tcp_v4_md5_do_del);
941
942 static void tcp_v4_clear_md5_list(struct sock *sk)
943 {
944         struct tcp_sock *tp = tcp_sk(sk);
945
946         /* Free each key, then the set of key keys,
947          * the crypto element, and then decrement our
948          * hold on the last resort crypto.
949          */
950         if (tp->md5sig_info->entries4) {
951                 int i;
952                 for (i = 0; i < tp->md5sig_info->entries4; i++)
953                         kfree(tp->md5sig_info->keys4[i].base.key);
954                 tp->md5sig_info->entries4 = 0;
955                 tcp_free_md5sig_pool();
956         }
957         if (tp->md5sig_info->keys4) {
958                 kfree(tp->md5sig_info->keys4);
959                 tp->md5sig_info->keys4 = NULL;
960                 tp->md5sig_info->alloced4  = 0;
961         }
962 }
963
964 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
965                                  int optlen)
966 {
967         struct tcp_md5sig cmd;
968         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
969         u8 *newkey;
970
971         if (optlen < sizeof(cmd))
972                 return -EINVAL;
973
974         if (copy_from_user(&cmd, optval, sizeof(cmd)))
975                 return -EFAULT;
976
977         if (sin->sin_family != AF_INET)
978                 return -EINVAL;
979
980         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
981                 if (!tcp_sk(sk)->md5sig_info)
982                         return -ENOENT;
983                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
984         }
985
986         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
987                 return -EINVAL;
988
989         if (!tcp_sk(sk)->md5sig_info) {
990                 struct tcp_sock *tp = tcp_sk(sk);
991                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
992
993                 if (!p)
994                         return -EINVAL;
995
996                 tp->md5sig_info = p;
997                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
998         }
999
1000         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1001         if (!newkey)
1002                 return -ENOMEM;
1003         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1004                                  newkey, cmd.tcpm_keylen);
1005 }
1006
1007 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1008                                    __be32 saddr, __be32 daddr,
1009                                    struct tcphdr *th, int protocol,
1010                                    unsigned int tcplen)
1011 {
1012         struct scatterlist sg[4];
1013         __u16 data_len;
1014         int block = 0;
1015         __sum16 old_checksum;
1016         struct tcp_md5sig_pool *hp;
1017         struct tcp4_pseudohdr *bp;
1018         struct hash_desc *desc;
1019         int err;
1020         unsigned int nbytes = 0;
1021
1022         /*
1023          * Okay, so RFC2385 is turned on for this connection,
1024          * so we need to generate the MD5 hash for the packet now.
1025          */
1026
1027         hp = tcp_get_md5sig_pool();
1028         if (!hp)
1029                 goto clear_hash_noput;
1030
1031         bp = &hp->md5_blk.ip4;
1032         desc = &hp->md5_desc;
1033
1034         /*
1035          * 1. the TCP pseudo-header (in the order: source IP address,
1036          * destination IP address, zero-padded protocol number, and
1037          * segment length)
1038          */
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = protocol;
1043         bp->len = htons(tcplen);
1044
1045         sg_init_table(sg, 4);
1046
1047         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1048         nbytes += sizeof(*bp);
1049
1050         /* 2. the TCP header, excluding options, and assuming a
1051          * checksum of zero/
1052          */
1053         old_checksum = th->check;
1054         th->check = 0;
1055         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1056         nbytes += sizeof(struct tcphdr);
1057
1058         /* 3. the TCP segment data (if any) */
1059         data_len = tcplen - (th->doff << 2);
1060         if (data_len > 0) {
1061                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1062                 sg_set_buf(&sg[block++], data, data_len);
1063                 nbytes += data_len;
1064         }
1065
1066         /* 4. an independently-specified key or password, known to both
1067          * TCPs and presumably connection-specific
1068          */
1069         sg_set_buf(&sg[block++], key->key, key->keylen);
1070         nbytes += key->keylen;
1071
1072         sg_mark_end(&sg[block - 1]);
1073
1074         /* Now store the Hash into the packet */
1075         err = crypto_hash_init(desc);
1076         if (err)
1077                 goto clear_hash;
1078         err = crypto_hash_update(desc, sg, nbytes);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_final(desc, md5_hash);
1082         if (err)
1083                 goto clear_hash;
1084
1085         /* Reset header, and free up the crypto */
1086         tcp_put_md5sig_pool();
1087         th->check = old_checksum;
1088
1089 out:
1090         return 0;
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         goto out;
1096 }
1097
1098 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099                          struct sock *sk,
1100                          struct dst_entry *dst,
1101                          struct request_sock *req,
1102                          struct tcphdr *th, int protocol,
1103                          unsigned int tcplen)
1104 {
1105         __be32 saddr, daddr;
1106
1107         if (sk) {
1108                 saddr = inet_sk(sk)->saddr;
1109                 daddr = inet_sk(sk)->daddr;
1110         } else {
1111                 struct rtable *rt = (struct rtable *)dst;
1112                 BUG_ON(!rt);
1113                 saddr = rt->rt_src;
1114                 daddr = rt->rt_dst;
1115         }
1116         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117                                        saddr, daddr,
1118                                        th, protocol, tcplen);
1119 }
1120
1121 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
1123 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1124 {
1125         /*
1126          * This gets called for each TCP segment that arrives
1127          * so we want to be efficient.
1128          * We have 3 drop cases:
1129          * o No MD5 hash and one expected.
1130          * o MD5 hash and we're not expecting one.
1131          * o MD5 hash and its wrong.
1132          */
1133         __u8 *hash_location = NULL;
1134         struct tcp_md5sig_key *hash_expected;
1135         const struct iphdr *iph = ip_hdr(skb);
1136         struct tcphdr *th = tcp_hdr(skb);
1137         int length = (th->doff << 2) - sizeof(struct tcphdr);
1138         int genhash;
1139         unsigned char *ptr;
1140         unsigned char newhash[16];
1141
1142         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1143
1144         /*
1145          * If the TCP option length is less than the TCP_MD5SIG
1146          * option length, then we can shortcut
1147          */
1148         if (length < TCPOLEN_MD5SIG) {
1149                 if (hash_expected)
1150                         return 1;
1151                 else
1152                         return 0;
1153         }
1154
1155         /* Okay, we can't shortcut - we have to grub through the options */
1156         ptr = (unsigned char *)(th + 1);
1157         while (length > 0) {
1158                 int opcode = *ptr++;
1159                 int opsize;
1160
1161                 switch (opcode) {
1162                 case TCPOPT_EOL:
1163                         goto done_opts;
1164                 case TCPOPT_NOP:
1165                         length--;
1166                         continue;
1167                 default:
1168                         opsize = *ptr++;
1169                         if (opsize < 2)
1170                                 goto done_opts;
1171                         if (opsize > length)
1172                                 goto done_opts;
1173
1174                         if (opcode == TCPOPT_MD5SIG) {
1175                                 hash_location = ptr;
1176                                 goto done_opts;
1177                         }
1178                 }
1179                 ptr += opsize-2;
1180                 length -= opsize;
1181         }
1182 done_opts:
1183         /* We've parsed the options - do we have a hash? */
1184         if (!hash_expected && !hash_location)
1185                 return 0;
1186
1187         if (hash_expected && !hash_location) {
1188                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1189                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1190                                NIPQUAD(iph->saddr), ntohs(th->source),
1191                                NIPQUAD(iph->daddr), ntohs(th->dest));
1192                 return 1;
1193         }
1194
1195         if (!hash_expected && hash_location) {
1196                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1197                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1198                                NIPQUAD(iph->saddr), ntohs(th->source),
1199                                NIPQUAD(iph->daddr), ntohs(th->dest));
1200                 return 1;
1201         }
1202
1203         /* Okay, so this is hash_expected and hash_location -
1204          * so we need to calculate the checksum.
1205          */
1206         genhash = tcp_v4_do_calc_md5_hash(newhash,
1207                                           hash_expected,
1208                                           iph->saddr, iph->daddr,
1209                                           th, sk->sk_protocol,
1210                                           skb->len);
1211
1212         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1213                 if (net_ratelimit()) {
1214                         printk(KERN_INFO "MD5 Hash failed for "
1215                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1216                                NIPQUAD(iph->saddr), ntohs(th->source),
1217                                NIPQUAD(iph->daddr), ntohs(th->dest),
1218                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219                 }
1220                 return 1;
1221         }
1222         return 0;
1223 }
1224
1225 #endif
1226
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_v4_send_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234 };
1235
#ifdef CONFIG_TCP_MD5SIG
/* IPv4-specific request_sock hooks; only the MD5 key lookup is set. */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup = tcp_v4_reqsk_md5_lookup,
};
#endif
1241
1242 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1243         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1244         .twsk_unique    = tcp_twsk_unique,
1245         .twsk_destructor= tcp_twsk_destructor,
1246 };
1247
1248 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249 {
1250         struct inet_request_sock *ireq;
1251         struct tcp_options_received tmp_opt;
1252         struct request_sock *req;
1253         __be32 saddr = ip_hdr(skb)->saddr;
1254         __be32 daddr = ip_hdr(skb)->daddr;
1255         __u32 isn = TCP_SKB_CB(skb)->when;
1256         struct dst_entry *dst = NULL;
1257 #ifdef CONFIG_SYN_COOKIES
1258         int want_cookie = 0;
1259 #else
1260 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1261 #endif
1262
1263         /* Never answer to SYNs send to broadcast or multicast */
1264         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1265                 goto drop;
1266
1267         /* TW buckets are converted to open requests without
1268          * limitations, they conserve resources and peer is
1269          * evidently real one.
1270          */
1271         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1272 #ifdef CONFIG_SYN_COOKIES
1273                 if (sysctl_tcp_syncookies) {
1274                         want_cookie = 1;
1275                 } else
1276 #endif
1277                 goto drop;
1278         }
1279
1280         /* Accept backlog is full. If we have already queued enough
1281          * of warm entries in syn queue, drop request. It is better than
1282          * clogging syn queue with openreqs with exponentially increasing
1283          * timeout.
1284          */
1285         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1286                 goto drop;
1287
1288         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1289         if (!req)
1290                 goto drop;
1291
1292 #ifdef CONFIG_TCP_MD5SIG
1293         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1294 #endif
1295
1296         tcp_clear_options(&tmp_opt);
1297         tmp_opt.mss_clamp = 536;
1298         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1299
1300         tcp_parse_options(skb, &tmp_opt, 0);
1301
1302         if (want_cookie && !tmp_opt.saw_tstamp)
1303                 tcp_clear_options(&tmp_opt);
1304
1305         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1306                 /* Some OSes (unknown ones, but I see them on web server, which
1307                  * contains information interesting only for windows'
1308                  * users) do not send their stamp in SYN. It is easy case.
1309                  * We simply do not advertise TS support.
1310                  */
1311                 tmp_opt.saw_tstamp = 0;
1312                 tmp_opt.tstamp_ok  = 0;
1313         }
1314         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1315
1316         tcp_openreq_init(req, &tmp_opt, skb);
1317
1318         if (security_inet_conn_request(sk, skb, req))
1319                 goto drop_and_free;
1320
1321         ireq = inet_rsk(req);
1322         ireq->loc_addr = daddr;
1323         ireq->rmt_addr = saddr;
1324         ireq->opt = tcp_v4_save_options(sk, skb);
1325         if (!want_cookie)
1326                 TCP_ECN_create_request(req, tcp_hdr(skb));
1327
1328         if (want_cookie) {
1329 #ifdef CONFIG_SYN_COOKIES
1330                 syn_flood_warning(skb);
1331                 req->cookie_ts = tmp_opt.tstamp_ok;
1332 #endif
1333                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1334         } else if (!isn) {
1335                 struct inet_peer *peer = NULL;
1336
1337                 /* VJ's idea. We save last timestamp seen
1338                  * from the destination in peer table, when entering
1339                  * state TIME-WAIT, and check against it before
1340                  * accepting new connection request.
1341                  *
1342                  * If "isn" is not zero, this request hit alive
1343                  * timewait bucket, so that all the necessary checks
1344                  * are made in the function processing timewait state.
1345                  */
1346                 if (tmp_opt.saw_tstamp &&
1347                     tcp_death_row.sysctl_tw_recycle &&
1348                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1349                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350                     peer->v4daddr == saddr) {
1351                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1352                             (s32)(peer->tcp_ts - req->ts_recent) >
1353                                                         TCP_PAWS_WINDOW) {
1354                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1355                                 goto drop_and_release;
1356                         }
1357                 }
1358                 /* Kill the following clause, if you dislike this way. */
1359                 else if (!sysctl_tcp_syncookies &&
1360                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1361                           (sysctl_max_syn_backlog >> 2)) &&
1362                          (!peer || !peer->tcp_ts_stamp) &&
1363                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1364                         /* Without syncookies last quarter of
1365                          * backlog is filled with destinations,
1366                          * proven to be alive.
1367                          * It means that we continue to communicate
1368                          * to destinations, already remembered
1369                          * to the moment of synflood.
1370                          */
1371                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1372                                        "request from " NIPQUAD_FMT "/%u\n",
1373                                        NIPQUAD(saddr),
1374                                        ntohs(tcp_hdr(skb)->source));
1375                         goto drop_and_release;
1376                 }
1377
1378                 isn = tcp_v4_init_sequence(skb);
1379         }
1380         tcp_rsk(req)->snt_isn = isn;
1381
1382         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1383                 goto drop_and_free;
1384
1385         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1386         return 0;
1387
1388 drop_and_release:
1389         dst_release(dst);
1390 drop_and_free:
1391         reqsk_free(req);
1392 drop:
1393         return 0;
1394 }
1395
1396
1397 /*
1398  * The three way handshake has completed - we got a valid synack -
1399  * now create the new socket.
1400  */
1401 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1402                                   struct request_sock *req,
1403                                   struct dst_entry *dst)
1404 {
1405         struct inet_request_sock *ireq;
1406         struct inet_sock *newinet;
1407         struct tcp_sock *newtp;
1408         struct sock *newsk;
1409 #ifdef CONFIG_TCP_MD5SIG
1410         struct tcp_md5sig_key *key;
1411 #endif
1412
1413         if (sk_acceptq_is_full(sk))
1414                 goto exit_overflow;
1415
1416         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1417                 goto exit;
1418
1419         newsk = tcp_create_openreq_child(sk, req, skb);
1420         if (!newsk)
1421                 goto exit;
1422
1423         newsk->sk_gso_type = SKB_GSO_TCPV4;
1424         sk_setup_caps(newsk, dst);
1425
1426         newtp                 = tcp_sk(newsk);
1427         newinet               = inet_sk(newsk);
1428         ireq                  = inet_rsk(req);
1429         newinet->daddr        = ireq->rmt_addr;
1430         newinet->rcv_saddr    = ireq->loc_addr;
1431         newinet->saddr        = ireq->loc_addr;
1432         newinet->opt          = ireq->opt;
1433         ireq->opt             = NULL;
1434         newinet->mc_index     = inet_iif(skb);
1435         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1436         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1437         if (newinet->opt)
1438                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1439         newinet->id = newtp->write_seq ^ jiffies;
1440
1441         tcp_mtup_init(newsk);
1442         tcp_sync_mss(newsk, dst_mtu(dst));
1443         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1444         tcp_initialize_rcv_mss(newsk);
1445
1446 #ifdef CONFIG_TCP_MD5SIG
1447         /* Copy over the MD5 key from the original socket */
1448         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1449                 /*
1450                  * We're using one, so create a matching key
1451                  * on the newsk structure. If we fail to get
1452                  * memory, then we end up not copying the key
1453                  * across. Shucks.
1454                  */
1455                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1456                 if (newkey != NULL)
1457                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1458                                           newkey, key->keylen);
1459         }
1460 #endif
1461
1462         __inet_hash_nolisten(newsk);
1463         __inet_inherit_port(sk, newsk);
1464
1465         return newsk;
1466
1467 exit_overflow:
1468         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1469 exit:
1470         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1471         dst_release(dst);
1472         return NULL;
1473 }
1474
1475 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1476 {
1477         struct tcphdr *th = tcp_hdr(skb);
1478         const struct iphdr *iph = ip_hdr(skb);
1479         struct sock *nsk;
1480         struct request_sock **prev;
1481         /* Find possible connection requests. */
1482         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1483                                                        iph->saddr, iph->daddr);
1484         if (req)
1485                 return tcp_check_req(sk, skb, req, prev);
1486
1487         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1488                         th->source, iph->daddr, th->dest, inet_iif(skb));
1489
1490         if (nsk) {
1491                 if (nsk->sk_state != TCP_TIME_WAIT) {
1492                         bh_lock_sock(nsk);
1493                         return nsk;
1494                 }
1495                 inet_twsk_put(inet_twsk(nsk));
1496                 return NULL;
1497         }
1498
1499 #ifdef CONFIG_SYN_COOKIES
1500         if (!th->rst && !th->syn && th->ack)
1501                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1502 #endif
1503         return sk;
1504 }
1505
1506 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1507 {
1508         const struct iphdr *iph = ip_hdr(skb);
1509
1510         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1511                 if (!tcp_v4_check(skb->len, iph->saddr,
1512                                   iph->daddr, skb->csum)) {
1513                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1514                         return 0;
1515                 }
1516         }
1517
1518         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1519                                        skb->len, IPPROTO_TCP, 0);
1520
1521         if (skb->len <= 76) {
1522                 return __skb_checksum_complete(skb);
1523         }
1524         return 0;
1525 }
1526
1527
1528 /* The socket must have it's spinlock held when we get
1529  * here.
1530  *
1531  * We have a potential double-lock case here, so even when
1532  * doing backlog processing we use the BH locking scheme.
1533  * This is because we cannot sleep with the original spinlock
1534  * held.
1535  */
1536 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1537 {
1538         struct sock *rsk;
1539 #ifdef CONFIG_TCP_MD5SIG
1540         /*
1541          * We really want to reject the packet as early as possible
1542          * if:
1543          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1544          *  o There is an MD5 option and we're not expecting one
1545          */
1546         if (tcp_v4_inbound_md5_hash(sk, skb))
1547                 goto discard;
1548 #endif
1549
1550         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1551                 TCP_CHECK_TIMER(sk);
1552                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1553                         rsk = sk;
1554                         goto reset;
1555                 }
1556                 TCP_CHECK_TIMER(sk);
1557                 return 0;
1558         }
1559
1560         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1561                 goto csum_err;
1562
1563         if (sk->sk_state == TCP_LISTEN) {
1564                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1565                 if (!nsk)
1566                         goto discard;
1567
1568                 if (nsk != sk) {
1569                         if (tcp_child_process(sk, nsk, skb)) {
1570                                 rsk = nsk;
1571                                 goto reset;
1572                         }
1573                         return 0;
1574                 }
1575         }
1576
1577         TCP_CHECK_TIMER(sk);
1578         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1579                 rsk = sk;
1580                 goto reset;
1581         }
1582         TCP_CHECK_TIMER(sk);
1583         return 0;
1584
1585 reset:
1586         tcp_v4_send_reset(rsk, skb);
1587 discard:
1588         kfree_skb(skb);
1589         /* Be careful here. If this function gets more complicated and
1590          * gcc suffers from register pressure on the x86, sk (in %ebx)
1591          * might be destroyed here. This current version compiles correctly,
1592          * but you have been warned.
1593          */
1594         return 0;
1595
1596 csum_err:
1597         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1598         goto discard;
1599 }
1600
1601 /*
1602  *      From tcp_input.c
1603  */
1604
     /*
      * tcp_v4_rcv - entry point for one received IPv4 TCP segment.
      * @skb: the received buffer.  On every path visible here it is either
      *       passed on (tcp_prequeue(), tcp_v4_do_rcv(), socket backlog) or
      *       freed at discard_it.
      *
      * Sanity-checks the TCP header, fills in TCP_SKB_CB(skb), looks up the
      * owning socket and dispatches the segment to it.  Segments without a
      * socket and segments that hit a TIME_WAIT socket are handled at the
      * labels near the bottom of the function.
      *
      * Returns the value of tcp_v4_do_rcv() when that is called directly,
      * and 0 on every other path.
      */
1605 int tcp_v4_rcv(struct sk_buff *skb)
1606 {
1607         const struct iphdr *iph;
1608         struct tcphdr *th;
1609         struct sock *sk;
1610         int ret;
1611
             /* Only frames marked PACKET_HOST are processed; the rest are dropped. */
1612         if (skb->pkt_type != PACKET_HOST)
1613                 goto discard_it;
1614
1615         /* Count it even if it's bad */
1616         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1617
1618         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1619                 goto discard_it;
1620
1621         th = tcp_hdr(skb);
1622
             /* doff is in 32-bit words: reject headers shorter than struct tcphdr,
              * then make sure the whole header (doff * 4 bytes) can be pulled.
              */
1623         if (th->doff < sizeof(struct tcphdr) / 4)
1624                 goto bad_packet;
1625         if (!pskb_may_pull(skb, th->doff * 4))
1626                 goto discard_it;
1627
1628         /* An explanation is required here, I think.
1629          * Packet length and doff are validated by header prediction,
1630          * provided case of th->doff==0 is eliminated.
1631          * So, we defer the checks. */
1632         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1633                 goto bad_packet;
1634
             /* NOTE(review): th is fetched a second time here, presumably because
              * the pulls above may have moved the header data - confirm.
              */
1635         th = tcp_hdr(skb);
1636         iph = ip_hdr(skb);
             /* Fill the control block.  SYN and FIN each add one to the sequence
              * range covered by the segment; the flags slot receives the IP TOS.
              */
1637         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1638         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1639                                     skb->len - th->doff * 4);
1640         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1641         TCP_SKB_CB(skb)->when    = 0;
1642         TCP_SKB_CB(skb)->flags   = iph->tos;
1643         TCP_SKB_CB(skb)->sacked  = 0;
1644
1645         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1646                         th->source, iph->daddr, th->dest, inet_iif(skb));
1647         if (!sk)
1648                 goto no_tcp_socket;
1649
     /* Also re-entered from do_time_wait with sk replaced by a listener. */
1650 process:
1651         if (sk->sk_state == TCP_TIME_WAIT)
1652                 goto do_time_wait;
1653
1654         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1655                 goto discard_and_relse;
1656         nf_reset(skb);
1657
1658         if (sk_filter(sk, skb))
1659                 goto discard_and_relse;
1660
1661         skb->dev = NULL;
1662
             /* With the socket bh-locked: if no user context owns it, process the
              * segment now (straight to tcp_v4_do_rcv() when a DMA channel is in
              * use, otherwise only when tcp_prequeue() returns 0); if a user
              * context owns it, park the segment on the backlog.
              */
1663         bh_lock_sock_nested(sk);
1664         ret = 0;
1665         if (!sock_owned_by_user(sk)) {
1666 #ifdef CONFIG_NET_DMA
1667                 struct tcp_sock *tp = tcp_sk(sk);
1668                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1669                         tp->ucopy.dma_chan = get_softnet_dma();
1670                 if (tp->ucopy.dma_chan)
1671                         ret = tcp_v4_do_rcv(sk, skb);
1672                 else
1673 #endif
1674                 {
1675                         if (!tcp_prequeue(sk, skb))
1676                         ret = tcp_v4_do_rcv(sk, skb);
1677                 }
1678         } else
1679                 sk_add_backlog(sk, skb);
1680         bh_unlock_sock(sk);
1681
1682         sock_put(sk);
1683
1684         return ret;
1685
1686 no_tcp_socket:
1687         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1688                 goto discard_it;
1689
             /* Truncated or bad-checksum segments are only counted; any other
              * segment without a socket is answered with a reset.  bad_packet sits
              * inside the if-body so the early header failures above share the
              * INERRS increment.
              */
1690         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1691 bad_packet:
1692                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1693         } else {
1694                 tcp_v4_send_reset(NULL, skb);
1695         }
1696
1697 discard_it:
1698         /* Discard frame. */
1699         kfree_skb(skb);
1700         return 0;
1701
     /* Drop the reference on sk before freeing the skb. */
1702 discard_and_relse:
1703         sock_put(sk);
1704         goto discard_it;
1705
     /* sk is a TIME_WAIT socket here; every exit releases it with inet_twsk_put()
      * or leaves that to the callee.
      */
1706 do_time_wait:
1707         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1708                 inet_twsk_put(inet_twsk(sk));
1709                 goto discard_it;
1710         }
1711
1712         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1713                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1714                 inet_twsk_put(inet_twsk(sk));
1715                 goto discard_it;
1716         }
1717         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1718         case TCP_TW_SYN: {
                     /* A SYN hit the TIME_WAIT socket: if a listener exists for the
                      * destination, retire the timewait socket and restart
                      * processing on the listener.
                      */
1719                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1720                                                         &tcp_hashinfo,
1721                                                         iph->daddr, th->dest,
1722                                                         inet_iif(skb));
1723                 if (sk2) {
1724                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1725                         inet_twsk_put(inet_twsk(sk));
1726                         sk = sk2;
1727                         goto process;
1728                 }
1729                 /* Fall through to ACK */
1730         }
1731         case TCP_TW_ACK:
1732                 tcp_v4_timewait_ack(sk, skb);
1733                 break;
1734         case TCP_TW_RST:
1735                 goto no_tcp_socket;
1736         case TCP_TW_SUCCESS:;
1737         }
1738         goto discard_it;
1739 }
1740
1741 /* VJ's idea. Save last timestamp seen from this destination
1742  * and hold it at least for normal timewait interval to use for duplicate
1743  * segment detection in subsequent connections, before they enter synchronized
1744  * state.
1745  */
1746
     /*
      * tcp_v4_remember_stamp - copy the socket's latest received timestamp
      * state into the inet_peer entry for its destination address.
      *
      * Returns 1 when a peer entry was available, 0 otherwise.
      */
1747 int tcp_v4_remember_stamp(struct sock *sk)
1748 {
1749         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1750         struct inet_sock *inet = inet_sk(sk);
1751         struct tcp_sock *tp = tcp_sk(sk);
1752         struct inet_peer *peer;
1753         int from_route;
1754
1755         /* The peer attached to the cached route is used only when that route
1756          * points at our destination; otherwise look the peer up directly. */
1757         from_route = rt && rt->rt_dst == inet->daddr;
1758         if (from_route) {
1759                 if (!rt->peer)
1760                         rt_bind_peer(rt, 1);
1761                 peer = rt->peer;
1762         } else {
1763                 peer = inet_getpeer(inet->daddr, 1);
1764         }
1765         if (!peer)
1766                 return 0;
1767
1768         if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1769             (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1770              peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1771                 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1772                 peer->tcp_ts = tp->rx_opt.ts_recent;
1773         }
1774         if (!from_route)
1775                 inet_putpeer(peer);     /* pairs with inet_getpeer() above */
1776         return 1;
1777 }
1778
     /*
      * tcp_v4_tw_remember_stamp - copy the timestamp state kept in a timewait
      * socket into the inet_peer entry for tw->tw_daddr.
      *
      * Returns 1 when a peer entry was obtained, 0 otherwise.
      */
1779 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1780 {
1781         const struct tcp_timewait_sock *tcptw;
1782         struct inet_peer *peer;
1783
1784         peer = inet_getpeer(tw->tw_daddr, 1);
1785         if (!peer)
1786                 return 0;
1787
1788         tcptw = tcp_twsk((struct sock *)tw);
1789         if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1790             (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1791              peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1792                 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1793                 peer->tcp_ts       = tcptw->tw_ts_recent;
1794         }
1795         inet_putpeer(peer);
1796         return 1;
1797 }
1798
     /*
      * Address-family specific connection-socket operations for TCP over
      * IPv4.  tcp_v4_init_sock() installs this table as icsk->icsk_af_ops,
      * and it is exported for use outside this file.
      */
1799 struct inet_connection_sock_af_ops ipv4_specific = {
1800         .queue_xmit        = ip_queue_xmit,
1801         .send_check        = tcp_v4_send_check,
1802         .rebuild_header    = inet_sk_rebuild_header,
1803         .conn_request      = tcp_v4_conn_request,
1804         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1805         .remember_stamp    = tcp_v4_remember_stamp,
1806         .net_header_len    = sizeof(struct iphdr),
1807         .setsockopt        = ip_setsockopt,
1808         .getsockopt        = ip_getsockopt,
1809         .addr2sockaddr     = inet_csk_addr2sockaddr,
1810         .sockaddr_len      = sizeof(struct sockaddr_in),
1811         .bind_conflict     = inet_csk_bind_conflict,
1812 #ifdef CONFIG_COMPAT
1813         .compat_setsockopt = compat_ip_setsockopt,
1814         .compat_getsockopt = compat_ip_getsockopt,
1815 #endif
1816 };
1817
1818 #ifdef CONFIG_TCP_MD5SIG
     /*
      * IPv4 TCP MD5-signature hooks.  Only built with CONFIG_TCP_MD5SIG;
      * tcp_v4_init_sock() installs this table as tp->af_specific.
      */
1819 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1820         .md5_lookup             = tcp_v4_md5_lookup,
1821         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1822         .md5_add                = tcp_v4_md5_add_func,
1823         .md5_parse              = tcp_v4_parse_md5_keys,
1824 };
1825 #endif
1826
1827 /* NOTE: A lot of things set to zero explicitly by call to
1828  *       sk_alloc() so need not be done here.
1829  */
     /*
      * tcp_v4_init_sock - set the non-zero initial state of an IPv4 TCP socket.
      * @sk: socket being initialised.
      *
      * Initialises the queues and timers, the initial RTO and congestion
      * values, the IPv4 operation tables and the default buffer sizes, then
      * bumps the tcp_sockets_allocated counter.  Used as tcp_prot.init.
      * Always returns 0.
      */
1830 static int tcp_v4_init_sock(struct sock *sk)
1831 {
1832         struct inet_connection_sock *icsk = inet_csk(sk);
1833         struct tcp_sock *tp = tcp_sk(sk);
1834
1835         skb_queue_head_init(&tp->out_of_order_queue);
1836         tcp_init_xmit_timers(sk);
1837         tcp_prequeue_init(tp);
1838
1839         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1840         tp->mdev = TCP_TIMEOUT_INIT;
1841
1842         /* So many TCP implementations out there (incorrectly) count the
1843          * initial SYN frame in their delayed-ACK and congestion control
1844          * algorithms that we must have the following bandaid to talk
1845          * efficiently to them.  -DaveM
1846          */
1847         tp->snd_cwnd = 2;
1848
1849         /* See draft-stevens-tcpca-spec-01 for discussion of the
1850          * initialization of these values.
1851          */
1852         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1853         tp->snd_cwnd_clamp = ~0;
             /* NOTE(review): 536 is presumably the classic default MSS (576-byte
              * datagram minus 40 bytes of IP and TCP headers) - confirm.
              */
1854         tp->mss_cache = 536;
1855
1856         tp->reordering = sysctl_tcp_reordering;
1857         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1858
1859         sk->sk_state = TCP_CLOSE;
1860
1861         sk->sk_write_space = sk_stream_write_space;
1862         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1863
             /* Hook up the IPv4-specific operation tables defined in this file. */
1864         icsk->icsk_af_ops = &ipv4_specific;
1865         icsk->icsk_sync_mss = tcp_sync_mss;
1866 #ifdef CONFIG_TCP_MD5SIG
1867         tp->af_specific = &tcp_sock_ipv4_specific;
1868 #endif
1869
             /* Start from the middle ([1]) entry of the wmem/rmem sysctl arrays. */
1870         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1871         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1872
1873         atomic_inc(&tcp_sockets_allocated);
1874
1875         return 0;
1876 }
1877
     /*
      * tcp_v4_destroy_sock - release everything a TCP socket still holds.
      * @sk: socket being destroyed.
      *
      * Stops the timers, releases congestion-control state, purges the write,
      * out-of-order, prequeue (and, with CONFIG_NET_DMA, async wait) queues,
      * frees MD5 key state, drops the bind bucket and the cached sendmsg
      * page, and decrements tcp_sockets_allocated.  Used as tcp_prot.destroy
      * and exported.  Always returns 0.
      */
1878 int tcp_v4_destroy_sock(struct sock *sk)
1879 {
1880         struct tcp_sock *tp = tcp_sk(sk);
1881
1882         tcp_clear_xmit_timers(sk);
1883
1884         tcp_cleanup_congestion_control(sk);
1885
1886         /* Clean up the write buffer. */
1887         tcp_write_queue_purge(sk);
1888
1889         /* Cleans up our, hopefully empty, out_of_order_queue. */
1890         __skb_queue_purge(&tp->out_of_order_queue);
1891
1892 #ifdef CONFIG_TCP_MD5SIG
1893         /* Clean up the MD5 key list, if any */
1894         if (tp->md5sig_info) {
1895                 tcp_v4_clear_md5_list(sk);
1896                 kfree(tp->md5sig_info);
1897                 tp->md5sig_info = NULL;
1898         }
1899 #endif
1900
1901 #ifdef CONFIG_NET_DMA
1902         /* Cleans up our sk_async_wait_queue */
1903         __skb_queue_purge(&sk->sk_async_wait_queue);
1904 #endif
1905
1906         /* Clean prequeue, it must be empty really */
1907         __skb_queue_purge(&tp->ucopy.prequeue);
1908
1909         /* Clean up a referenced TCP bind bucket. */
1910         if (inet_csk(sk)->icsk_bind_hash)
1911                 inet_put_port(sk);
1912
1913         /*
1914          * If sendmsg cached page exists, toss it.
1915          */
1916         if (sk->sk_sndmsg_page) {
1917                 __free_page(sk->sk_sndmsg_page);
1918                 sk->sk_sndmsg_page = NULL;
1919         }
1920
             /* Balances the atomic_inc() done in tcp_v4_init_sock(). */
1921         atomic_dec(&tcp_sockets_allocated);
1922
1923         return 0;
1924 }
1925
1926 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1927
1928 #ifdef CONFIG_PROC_FS
1929 /* Proc filesystem TCP sock list dumping. */
1930
     /* Return the first timewait socket on @head, or NULL for an empty chain. */
1931 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1932 {
1933         return !hlist_empty(head) ?
1934                 list_entry(head->first, struct inet_timewait_sock, tw_node) : NULL;
1935 }
1936
     /* Return the timewait socket that follows @tw on its chain, or NULL at the end. */
1937 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1938 {
1939         return !tw->tw_node.next ? NULL :
1940                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node);
1941 }
1942
     /*
      * listening_get_next - advance the /proc iterator over the listening hash.
      * @seq: seq_file whose private data is the struct tcp_iter_state.
      * @cur: entry returned by the previous call (a struct sock, or a struct
      *       request_sock while st->state is TCP_SEQ_STATE_OPENREQ), or NULL
      *       to start from bucket 0.
      *
      * Walks listening sockets bucket by bucket and, for a socket whose accept
      * queue has pending requests, the requests in its syn_table as well.
      * Sockets are filtered on st->family and the seq_file's namespace.
      *
      * Returns the next matching entry or NULL when the table is exhausted.
      * When a request is returned, st->state is TCP_SEQ_STATE_OPENREQ and the
      * syn_wait_lock of st->syn_wait_sk is left read-locked; it is dropped
      * here once that socket's requests run out, or by tcp_seq_stop().
      */
1943 static void *listening_get_next(struct seq_file *seq, void *cur)
1944 {
1945         struct inet_connection_sock *icsk;
1946         struct hlist_node *node;
1947         struct sock *sk = cur;
1948         struct tcp_iter_state* st = seq->private;
1949         struct net *net = seq_file_net(seq);
1950
1951         if (!sk) {
1952                 st->bucket = 0;
1953                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1954                 goto get_sk;
1955         }
1956
1957         ++st->num;
1958
1959         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1960                 struct request_sock *req = cur;
1961
1962                 icsk = inet_csk(st->syn_wait_sk);
1963                 req = req->dl_next;
1964                 while (1) {
1965                         while (req) {
                                     /* A request still in the SYN table has no
                                      * child socket, so req->sk must not be
                                      * dereferenced here.  The request belongs
                                      * to the listener it is queued on, so use
                                      * the listener for the namespace test.
                                      */
1966                                 if (req->rsk_ops->family == st->family &&
1967                                     net_eq(sock_net(st->syn_wait_sk), net)) {
1968                                         cur = req;
1969                                         goto out;
1970                                 }
1971                                 req = req->dl_next;
1972                         }
1973                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1974                                 break;
1975 get_req:
1976                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1977                 }
                     /* Requests of this listener are exhausted: go back to walking
                      * sockets and release the lock taken before start_req.
                      */
1978                 sk        = sk_next(st->syn_wait_sk);
1979                 st->state = TCP_SEQ_STATE_LISTENING;
1980                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1981         } else {
                     /* cur was a listening socket that has just been shown: visit
                      * its pending requests, if any, before moving on.
                      */
1982                 icsk = inet_csk(sk);
1983                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1984                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1985                         goto start_req;
1986                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1987                 sk = sk_next(sk);
1988         }
1989 get_sk:
1990         sk_for_each_from(sk, node) {
1991                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1992                         cur = sk;
1993                         goto out;
1994                 }
                     /* Socket itself is not shown, but its request queue is still
                      * scanned for requests of the wanted family.
                      */
1995                 icsk = inet_csk(sk);
1996                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1997                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1998 start_req:
1999                         st->uid         = sock_i_uid(sk);
2000                         st->syn_wait_sk = sk;
2001                         st->state       = TCP_SEQ_STATE_OPENREQ;
2002                         st->sbucket     = 0;
2003                         goto get_req;
2004                 }
2005                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006         }
2007         if (++st->bucket < INET_LHTABLE_SIZE) {
2008                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2009                 goto get_sk;
2010         }
2011         cur = NULL;
2012 out:
2013         return cur;
2014 }
2015
     /*
      * Step through the listening table, taking one step for each unit of
      * *pos and decrementing *pos as it goes.  Returns the entry reached, or
      * NULL if the table ran out first.
      */
2016 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2017 {
2018         void *entry;
2019
2020         for (entry = listening_get_next(seq, NULL); entry && *pos; --*pos)
2021                 entry = listening_get_next(seq, entry);
2022
2023         /* On NULL, *pos holds the offset that is still to be consumed. */
2024         return entry;
2025 }
2026
     /*
      * established_get_first - find the first entry in the established hash
      * that matches st->family and the seq_file's namespace.
      *
      * Scans each bucket's socket chain and then its timewait chain.  On a
      * hit the function returns with that bucket's lock still read-locked
      * and st->bucket identifying it; st->state is TCP_SEQ_STATE_TIME_WAIT if
      * the hit came from the timewait chain.  Returns NULL, with no lock
      * held, when nothing matches.
      */
2027 static void *established_get_first(struct seq_file *seq)
2028 {
2029         struct tcp_iter_state* st = seq->private;
2030         struct net *net = seq_file_net(seq);
2031         void *rc = NULL;
2032
2033         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2034                 struct sock *sk;
2035                 struct hlist_node *node;
2036                 struct inet_timewait_sock *tw;
2037                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2038
2039                 read_lock_bh(lock);
2040                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2041                         if (sk->sk_family != st->family ||
2042                             !net_eq(sock_net(sk), net)) {
2043                                 continue;
2044                         }
2045                         rc = sk;
2046                         goto out;
2047                 }
                     /* No socket matched in this bucket; try its timewait chain. */
2048                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2049                 inet_twsk_for_each(tw, node,
2050                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2051                         if (tw->tw_family != st->family ||
2052                             !net_eq(twsk_net(tw), net)) {
2053                                 continue;
2054                         }
2055                         rc = tw;
2056                         goto out;
2057                 }
2058                 read_unlock_bh(lock);
2059                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2060         }
2061 out:
2062         return rc;
2063 }
2064
     /*
      * established_get_next - advance the iterator over the established hash.
      * @seq: seq_file whose private data is the struct tcp_iter_state.
      * @cur: entry returned by the previous call; an inet_timewait_sock when
      *       st->state is TCP_SEQ_STATE_TIME_WAIT, otherwise a sock.
      *
      * Within a bucket the socket chain is visited first, then the timewait
      * chain.  Entered with the lock of bucket st->bucket read-held; when
      * moving to another bucket the old lock is dropped and the new one
      * taken.  Returns the next matching entry (lock still held) or NULL once
      * all buckets are done (lock released).
      */
2065 static void *established_get_next(struct seq_file *seq, void *cur)
2066 {
2067         struct sock *sk = cur;
2068         struct inet_timewait_sock *tw;
2069         struct hlist_node *node;
2070         struct tcp_iter_state* st = seq->private;
2071         struct net *net = seq_file_net(seq);
2072
2073         ++st->num;
2074
2075         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2076                 tw = cur;
2077                 tw = tw_next(tw);
2078 get_tw:
                     /* Skip timewait sockets of another family or namespace. */
2079                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2080                         tw = tw_next(tw);
2081                 }
2082                 if (tw) {
2083                         cur = tw;
2084                         goto out;
2085                 }
                     /* This bucket is finished: release it and move to the next. */
2086                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2087                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2088
2089                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2090                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2091                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2092                 } else {
2093                         cur = NULL;
2094                         goto out;
2095                 }
2096         } else
2097                 sk = sk_next(sk);
2098
2099         sk_for_each_from(sk, node) {
2100                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2101                         goto found;
2102         }
2103
             /* Socket chain exhausted: continue with this bucket's timewait chain. */
2104         st->state = TCP_SEQ_STATE_TIME_WAIT;
2105         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2106         goto get_tw;
2107 found:
2108         cur = sk;
2109 out:
2110         return cur;
2111 }
2112
     /*
      * Return the entry @pos steps after the first matching entry of the
      * established hash, or NULL if the table holds fewer entries.
      */
2113 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2114 {
2115         void *entry;
2116
2117         /* One established_get_next() call per remaining unit of pos. */
2118         for (entry = established_get_first(seq); entry && pos; --pos)
2119                 entry = established_get_next(seq, entry);
2120
2121         return entry;
2122 }
2123
     /*
      * Position the iterator @pos entries into the combined listing: the
      * listening table first, then the established hash.  Returns the entry
      * reached or NULL if there are not that many.
      */
2124 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2125 {
2126         struct tcp_iter_state *iter = seq->private;
2127         void *found;
2128
2129         inet_listen_lock(&tcp_hashinfo);
2130         iter->state = TCP_SEQ_STATE_LISTENING;
2131         found = listening_get_idx(seq, &pos);
2132         if (found)
2133                 return found;   /* listen lock stays held for the caller */
2134
2135         /* Listening table exhausted: carry on in the established hash. */
2136         inet_listen_unlock(&tcp_hashinfo);
2137         iter->state = TCP_SEQ_STATE_ESTABLISHED;
2138
2139         return established_get_idx(seq, pos);
2140 }
2141
     /*
      * seq_file start callback.  Resets the iterator state; position 0 yields
      * SEQ_START_TOKEN, any other position is located with tcp_get_idx().
      */
2142 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2143 {
2144         struct tcp_iter_state *iter = seq->private;
2145         iter->num = 0;
2146         iter->state = TCP_SEQ_STATE_LISTENING;
2147         return !*pos ? SEQ_START_TOKEN : tcp_get_idx(seq, *pos - 1);
2148 }
2149
     /*
      * seq_file next callback.  Advances from @v to the following entry and
      * always increments *pos.  After SEQ_START_TOKEN the walk begins at
      * index 0.  When the listening table runs out the listen lock is
      * released and the walk continues in the established hash.  Returns the
      * next entry, or NULL at the end.
      */
2150 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2151 {
2152         void *rc = NULL;
2153         struct tcp_iter_state* st;
2154
2155         if (v == SEQ_START_TOKEN) {
2156                 rc = tcp_get_idx(seq, 0);
2157                 goto out;
2158         }
2159         st = seq->private;
2160
2161         switch (st->state) {
2162         case TCP_SEQ_STATE_OPENREQ:
2163         case TCP_SEQ_STATE_LISTENING:
2164                 rc = listening_get_next(seq, v);
2165                 if (!rc) {
2166                         inet_listen_unlock(&tcp_hashinfo);
2167                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2168                         rc        = established_get_first(seq);
2169                 }
2170                 break;
2171         case TCP_SEQ_STATE_ESTABLISHED:
2172         case TCP_SEQ_STATE_TIME_WAIT:
2173                 rc = established_get_next(seq, v);
2174                 break;
2175         }
2176 out:
2177         ++*pos;
2178         return rc;
2179 }
2180
     /*
      * seq_file stop callback.  Releases whichever locks the iterator state
      * says are still held for the current entry @v.
      */
2181 static void tcp_seq_stop(struct seq_file *seq, void *v)
2182 {
2183         struct tcp_iter_state* st = seq->private;
2184
2185         switch (st->state) {
2186         case TCP_SEQ_STATE_OPENREQ:
                     /* Stopped on an open request: its listener's syn_wait_lock
                      * is held in addition to the listen lock.
                      */
2187                 if (v) {
2188                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2189                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2190                 }
                     /* fall through: the listen lock is released below */
2191         case TCP_SEQ_STATE_LISTENING:
                     /* SEQ_START_TOKEN is returned by tcp_seq_start() without
                      * taking the listen lock.
                      */
2192                 if (v != SEQ_START_TOKEN)
2193                         inet_listen_unlock(&tcp_hashinfo);
2194                 break;
2195         case TCP_SEQ_STATE_TIME_WAIT:
2196         case TCP_SEQ_STATE_ESTABLISHED:
                     /* A non-NULL entry means its bucket lock is still held. */
2197                 if (v)
2198                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2199                 break;
2200         }
2201 }
2202
     /*
      * open() handler for the per-family proc file: sets up a seq_file with a
      * struct tcp_iter_state as private data and records the address family
      * from the tcp_seq_afinfo attached to the proc entry.  Returns 0, or the
      * negative error from seq_open_net().
      */
2203 static int tcp_seq_open(struct inode *inode, struct file *file)
2204 {
2205         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2206         struct tcp_iter_state *iter;
2207         struct seq_file *seq;
2208         int rc;
2209
2210         rc = seq_open_net(inode, file, &afinfo->seq_ops, sizeof(*iter));
2211         if (rc < 0)
2212                 return rc;
2213         seq = file->private_data;
2214         iter = seq->private;
2215         iter->family = afinfo->family;
2216         return 0;
2217 }
2218
     /*
      * tcp_proc_register - create the proc entry described by @afinfo in @net.
      *
      * Fills in the shared file and seq_file callbacks, then creates the
      * read-only entry afinfo->name under net->proc_net.  Returns 0 on
      * success or -ENOMEM if the entry could not be created.
      */
2219 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2220 {
2221         struct proc_dir_entry *entry;
2222
2223         /* Iteration callbacks shared by every family using this helper. */
2224         afinfo->seq_ops.start           = tcp_seq_start;
2225         afinfo->seq_ops.next            = tcp_seq_next;
2226         afinfo->seq_ops.stop            = tcp_seq_stop;
2227
2228         /* File operations for the proc entry itself. */
2229         afinfo->seq_fops.open           = tcp_seq_open;
2230         afinfo->seq_fops.read           = seq_read;
2231         afinfo->seq_fops.llseek         = seq_lseek;
2232         afinfo->seq_fops.release        = seq_release_net;
2233
2234         entry = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2235                                  &afinfo->seq_fops, afinfo);
2236
2237         return entry ? 0 : -ENOMEM;
2238 }
2239
     /* Remove the proc entry named afinfo->name from @net. */
2240 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2241 {
2242         proc_net_remove(net, afinfo->name);
2243 }
2244
     /*
      * get_openreq4 - print one line for a pending connection request.
      * @sk:  listening socket the request is queued on (supplies the local
      *       port and the reference count column).
      * @req: the request being shown.
      * @f:   seq_file to print into.
      * @i:   value for the leading "sl" column.
      * @uid: value for the uid column.
      * @len: receives, through the trailing %n, the number of characters
      *       printed.
      *
      * The state column is always TCP_SYN_RECV and the queue, timeout and
      * inode columns are printed as 0.
      */
2245 static void get_openreq4(struct sock *sk, struct request_sock *req,
2246                          struct seq_file *f, int i, int uid, int *len)
2247 {
2248         const struct inet_request_sock *ireq = inet_rsk(req);
             /* Jiffies until req->expires; may be negative if already past. */
2249         int ttd = req->expires - jiffies;
2250
2251         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2252                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2253                 i,
2254                 ireq->loc_addr,
2255                 ntohs(inet_sk(sk)->sport),
2256                 ireq->rmt_addr,
2257                 ntohs(ireq->rmt_port),
2258                 TCP_SYN_RECV,
2259                 0, 0, /* could print option size, but that is af dependent. */
2260                 1,    /* timers active (only the expire timer) */
2261                 jiffies_to_clock_t(ttd),
2262                 req->retrans,
2263                 uid,
2264                 0,  /* non standard timer */
2265                 0, /* open_requests have no inode */
2266                 atomic_read(&sk->sk_refcnt),
2267                 req,
2268                 len);
2269 }
2270
     /*
      * get_tcp4_sock - print one line for a full TCP socket.
      * @sk:  socket to describe.
      * @f:   seq_file to print into.
      * @i:   value for the leading "sl" column.
      * @len: receives, through the trailing %n, the number of characters
      *       printed.
      *
      * The second queue column is sk_ack_backlog for a listening socket and
      * rcv_nxt - copied_seq otherwise.  The last column prints -1 when
      * snd_ssthresh is 0xFFFF or larger.
      */
2271 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2272 {
2273         int timer_active;
2274         unsigned long timer_expires;
2275         struct tcp_sock *tp = tcp_sk(sk);
2276         const struct inet_connection_sock *icsk = inet_csk(sk);
2277         struct inet_sock *inet = inet_sk(sk);
2278         __be32 dest = inet->daddr;
2279         __be32 src = inet->rcv_saddr;
2280         __u16 destp = ntohs(inet->dport);
2281         __u16 srcp = ntohs(inet->sport);
2282
             /* Timer column codes: 1 = retransmit timer pending, 4 = zero-window
              * probe timer pending, 2 = sk_timer pending, 0 = none of these (the
              * expiry is then reported relative to the current jiffies, i.e. 0).
              */
2283         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2284                 timer_active    = 1;
2285                 timer_expires   = icsk->icsk_timeout;
2286         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2287                 timer_active    = 4;
2288                 timer_expires   = icsk->icsk_timeout;
2289         } else if (timer_pending(&sk->sk_timer)) {
2290                 timer_active    = 2;
2291                 timer_expires   = sk->sk_timer.expires;
2292         } else {
2293                 timer_active    = 0;
2294                 timer_expires = jiffies;
2295         }
2296
2297         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2298                         "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2299                 i, src, srcp, dest, destp, sk->sk_state,
2300                 tp->write_seq - tp->snd_una,
2301                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2302                                              (tp->rcv_nxt - tp->copied_seq),
2303                 timer_active,
2304                 jiffies_to_clock_t(timer_expires - jiffies),
2305                 icsk->icsk_retransmits,
2306                 sock_i_uid(sk),
2307                 icsk->icsk_probes_out,
2308                 sock_i_ino(sk),
2309                 atomic_read(&sk->sk_refcnt), sk,
2310                 icsk->icsk_rto,
2311                 icsk->icsk_ack.ato,
2312                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2313                 tp->snd_cwnd,
2314                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2315                 len);
2316 }
2317
     /*
      * get_timewait4_sock - print one line for a timewait socket.
      * @tw:  timewait socket to describe.
      * @f:   seq_file to print into.
      * @i:   value for the leading "sl" column.
      * @len: receives, through the trailing %n, the number of characters
      *       printed.
      */
2318 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2319                                struct seq_file *f, int i, int *len)
2320 {
2321         int remaining = tw->tw_ttd - jiffies;
2322         __be32 laddr = tw->tw_rcv_saddr;
2323         __be32 raddr = tw->tw_daddr;
2324         __u16 lport = ntohs(tw->tw_sport);
2325         __u16 rport = ntohs(tw->tw_dport);
2326
2327         /* A deadline that has already passed is shown as zero. */
2328         if (remaining < 0)
2329                 remaining = 0;
2330
2331         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2332                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2333                 i, laddr, lport, raddr, rport, tw->tw_substate,
2334                 0, 0,                                   /* tx_queue, rx_queue */
2335                 3, jiffies_to_clock_t(remaining),       /* tr, tm->when */
2336                 0, 0, 0, 0,             /* retrnsmt, uid, timeout, inode */
2337                 atomic_read(&tw->tw_refcnt), tw, len);
2338 }
2339
     /* Every output line is padded with spaces to TMPSZ - 1 characters. */
2340 #define TMPSZ 150
2341
     /*
      * tcp4_seq_show - seq_file show callback for the IPv4 TCP proc file.
      * @seq: seq_file to print into.
      * @v:   SEQ_START_TOKEN for the header line, otherwise the current
      *       entry, whose type is selected by st->state.
      *
      * Prints the column header or one socket line, then pads the line to a
      * fixed width.  Always returns 0.
      */
2342 static int tcp4_seq_show(struct seq_file *seq, void *v)
2343 {
2344         struct tcp_iter_state* st;
             /* Start from 0: the helpers below set len only through a %n at the
              * end of their seq_printf() format, so len would otherwise be read
              * uninitialised if seq_printf() returned without formatting.
              */
2345         int len = 0;
2346
2347         if (v == SEQ_START_TOKEN) {
2348                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2349                            "  sl  local_address rem_address   st tx_queue "
2350                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2351                            "inode");
2352                 goto out;
2353         }
2354         st = seq->private;
2355
2356         switch (st->state) {
2357         case TCP_SEQ_STATE_LISTENING:
2358         case TCP_SEQ_STATE_ESTABLISHED:
2359                 get_tcp4_sock(v, seq, st->num, &len);
2360                 break;
2361         case TCP_SEQ_STATE_OPENREQ:
2362                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2363                 break;
2364         case TCP_SEQ_STATE_TIME_WAIT:
2365                 get_timewait4_sock(v, seq, st->num, &len);
2366                 break;
2367         }
             /* Pad with spaces up to the fixed line width and end the line. */
2368         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2369 out:
2370         return 0;
2371 }
2372
     /*
      * Description of the IPv4 "tcp" proc file.  Only .owner and .show are
      * set here; tcp_proc_register() fills in the remaining file and
      * seq_file callbacks.
      */
2373 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2374         .name           = "tcp",
2375         .family         = AF_INET,
2376         .seq_fops       = {
2377                 .owner          = THIS_MODULE,
2378         },
2379         .seq_ops        = {
2380                 .show           = tcp4_seq_show,
2381         },
2382 };
2383
     /* Per-namespace init: register the IPv4 "tcp" proc file in @net. */
2384 static int tcp4_proc_init_net(struct net *net)
2385 {
2386         return tcp_proc_register(net, &tcp4_seq_afinfo);
2387 }
2388
     /* Per-namespace exit: remove the IPv4 "tcp" proc file from @net. */
2389 static void tcp4_proc_exit_net(struct net *net)
2390 {
2391         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2392 }
2393
     /* Per-namespace hooks that create and remove the IPv4 "tcp" proc file. */
2394 static struct pernet_operations tcp4_net_ops = {
2395         .init = tcp4_proc_init_net,
2396         .exit = tcp4_proc_exit_net,
2397 };
2398
     /* Register tcp4_net_ops; returns the result of register_pernet_subsys(). */
2399 int __init tcp4_proc_init(void)
2400 {
2401         return register_pernet_subsys(&tcp4_net_ops);
2402 }
2403
     /* Undo tcp4_proc_init() by unregistering tcp4_net_ops. */
2404 void tcp4_proc_exit(void)
2405 {
2406         unregister_pernet_subsys(&tcp4_net_ops);
2407 }
2408 #endif /* CONFIG_PROC_FS */
2409
     /*
      * Protocol operations table for TCP over IPv4 (exported).  .init and
      * .destroy are the socket setup and teardown functions defined in this
      * file, and .backlog_rcv is the same tcp_v4_do_rcv() handler that
      * tcp_v4_rcv() calls directly.
      */
2410 struct proto tcp_prot = {
2411         .name                   = "TCP",
2412         .owner                  = THIS_MODULE,
2413         .close                  = tcp_close,
2414         .connect                = tcp_v4_connect,
2415         .disconnect             = tcp_disconnect,
2416         .accept                 = inet_csk_accept,
2417         .ioctl                  = tcp_ioctl,
2418         .init                   = tcp_v4_init_sock,
2419         .destroy                = tcp_v4_destroy_sock,
2420         .shutdown               = tcp_shutdown,
2421         .setsockopt             = tcp_setsockopt,
2422         .getsockopt             = tcp_getsockopt,
2423         .recvmsg                = tcp_recvmsg,
2424         .backlog_rcv            = tcp_v4_do_rcv,
2425         .hash                   = inet_hash,
2426         .unhash                 = inet_unhash,
2427         .get_port               = inet_csk_get_port,
2428         .enter_memory_pressure  = tcp_enter_memory_pressure,
2429         .sockets_allocated      = &tcp_sockets_allocated,
2430         .orphan_count           = &tcp_orphan_count,
2431         .memory_allocated       = &tcp_memory_allocated,
2432         .memory_pressure        = &tcp_memory_pressure,
2433         .sysctl_mem             = sysctl_tcp_mem,
2434         .sysctl_wmem            = sysctl_tcp_wmem,
2435         .sysctl_rmem            = sysctl_tcp_rmem,
2436         .max_header             = MAX_TCP_HEADER,
2437         .obj_size               = sizeof(struct tcp_sock),
2438         .twsk_prot              = &tcp_timewait_sock_ops,
2439         .rsk_prot               = &tcp_request_sock_ops,
2440         .h.hashinfo             = &tcp_hashinfo,
2441 #ifdef CONFIG_COMPAT
2442         .compat_setsockopt      = compat_tcp_setsockopt,
2443         .compat_getsockopt      = compat_tcp_getsockopt,
2444 #endif
2445 };
2446
2447
     /*
      * Per-namespace init: create the raw IPPROTO_TCP control socket and
      * store it in net->ipv4.tcp_sock.  Returns the result of
      * inet_ctl_sock_create().
      */
2448 static int __net_init tcp_sk_init(struct net *net)
2449 {
2450         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2451                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2452 }
2453
     /* Per-namespace exit: destroy the control socket made by tcp_sk_init(). */
2454 static void __net_exit tcp_sk_exit(struct net *net)
2455 {
2456         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2457 }
2458
     /* Per-namespace hooks that create and destroy the TCP control socket. */
2459 static struct pernet_operations __net_initdata tcp_sk_ops = {
2460        .init = tcp_sk_init,
2461        .exit = tcp_sk_exit,
2462 };
2463
     /*
      * Boot-time initialisation: register the per-namespace control-socket
      * hooks.  Registration failure is treated as fatal and panics.
      */
2464 void __init tcp_v4_init(void)
2465 {
2466         if (register_pernet_device(&tcp_sk_ops))
2467                 panic("Failed to create the TCP control socket.\n");
2468 }
2469
     /* Symbols from this file made available to other kernel code. */
2470 EXPORT_SYMBOL(ipv4_specific);
2471 EXPORT_SYMBOL(tcp_hashinfo);
2472 EXPORT_SYMBOL(tcp_prot);
2473 EXPORT_SYMBOL(tcp_v4_conn_request);
2474 EXPORT_SYMBOL(tcp_v4_connect);
2475 EXPORT_SYMBOL(tcp_v4_do_rcv);
2476 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2477 EXPORT_SYMBOL(tcp_v4_send_check);
2478 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2479
     /* The proc registration helpers exist only with CONFIG_PROC_FS. */
2480 #ifdef CONFIG_PROC_FS
2481 EXPORT_SYMBOL(tcp_proc_register);
2482 EXPORT_SYMBOL(tcp_proc_unregister);
2483 #endif
2484 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2485