net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87
88
89 #ifdef CONFIG_TCP_MD5SIG
90 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
91                                                    __be32 addr);
92 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
93                                    __be32 saddr, __be32 daddr,
94                                    struct tcphdr *th, int protocol,
95                                    unsigned int tcplen);
96 #endif
97
98 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
99         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
100         .lhash_users = ATOMIC_INIT(0),
101         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
102 };
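/*
 * Note (added for exposition, not in the original): tcp_hashinfo is the
 * single shared set of TCP lookup tables (established, bind and listening
 * hashes).  Only the listening-hash lock and its wait queue can be set up
 * statically here; the other tables are sized and allocated at boot time.
 */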
103
104 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
105 {
106         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
107                                           ip_hdr(skb)->saddr,
108                                           tcp_hdr(skb)->dest,
109                                           tcp_hdr(skb)->source);
110 }
111
112 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113 {
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116
117         /* With PAWS, this is safe from the viewpoint
118            of data integrity. Even without PAWS it is safe, provided the sequence
119            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
120
121            Actually, the idea is close to VJ's, only the timestamp cache is
122            held not per host but per port pair, and the TW bucket is used as the
123            state holder.
124
125            If the TW bucket has already been destroyed, we fall back to VJ's scheme
126            and use the initial timestamp retrieved from the peer table.
127          */
128         if (tcptw->tw_ts_recent_stamp &&
129             (twp == NULL || (sysctl_tcp_tw_reuse &&
130                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
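                /* Explanatory note (not in the original): stepping write_seq
                 * past tw_snd_nxt by 65535 + 2 keeps the new connection's
                 * sequence space clear of anything the old TIME-WAIT peer
                 * could still accept, 65535 being the largest unscaled
                 * receive window.
                 */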
131                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
132                 if (tp->write_seq == 0)
133                         tp->write_seq = 1;
134                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
135                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
136                 sock_hold(sktw);
137                 return 1;
138         }
139
140         return 0;
141 }
142
143 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
144
145 /* This will initiate an outgoing connection. */
146 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
147 {
148         struct inet_sock *inet = inet_sk(sk);
149         struct tcp_sock *tp = tcp_sk(sk);
150         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
151         struct rtable *rt;
152         __be32 daddr, nexthop;
153         int tmp;
154         int err;
155
156         if (addr_len < sizeof(struct sockaddr_in))
157                 return -EINVAL;
158
159         if (usin->sin_family != AF_INET)
160                 return -EAFNOSUPPORT;
161
162         nexthop = daddr = usin->sin_addr.s_addr;
163         if (inet->opt && inet->opt->srr) {
164                 if (!daddr)
165                         return -EINVAL;
166                 nexthop = inet->opt->faddr;
167         }
168
169         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
170                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
171                                IPPROTO_TCP,
172                                inet->sport, usin->sin_port, sk, 1);
173         if (tmp < 0) {
174                 if (tmp == -ENETUNREACH)
175                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
176                 return tmp;
177         }
178
179         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
180                 ip_rt_put(rt);
181                 return -ENETUNREACH;
182         }
183
184         if (!inet->opt || !inet->opt->srr)
185                 daddr = rt->rt_dst;
186
187         if (!inet->saddr)
188                 inet->saddr = rt->rt_src;
189         inet->rcv_saddr = inet->saddr;
190
191         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
192                 /* Reset inherited state */
193                 tp->rx_opt.ts_recent       = 0;
194                 tp->rx_opt.ts_recent_stamp = 0;
195                 tp->write_seq              = 0;
196         }
197
198         if (tcp_death_row.sysctl_tw_recycle &&
199             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
200                 struct inet_peer *peer = rt_get_peer(rt);
201                 /*
202                  * VJ's idea. We save the last timestamp seen from
203                  * the destination in the peer table when entering
204                  * TIME-WAIT state, and initialize rx_opt.ts_recent from it
205                  * when trying a new connection.
206                  */
207                 if (peer != NULL &&
208                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
209                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
210                         tp->rx_opt.ts_recent = peer->tcp_ts;
211                 }
212         }
213
214         inet->dport = usin->sin_port;
215         inet->daddr = daddr;
216
217         inet_csk(sk)->icsk_ext_hdr_len = 0;
218         if (inet->opt)
219                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
220
221         tp->rx_opt.mss_clamp = 536;
222
223         /* Socket identity is still unknown (sport may be zero).
224          * However, we set the state to SYN-SENT and, without releasing the socket
225          * lock, select a source port, enter ourselves into the hash tables and
226          * complete initialization after this.
227          */
228         tcp_set_state(sk, TCP_SYN_SENT);
229         err = inet_hash_connect(&tcp_death_row, sk);
230         if (err)
231                 goto failure;
232
233         err = ip_route_newports(&rt, IPPROTO_TCP,
234                                 inet->sport, inet->dport, sk);
235         if (err)
236                 goto failure;
237
238         /* OK, now commit destination to socket.  */
239         sk->sk_gso_type = SKB_GSO_TCPV4;
240         sk_setup_caps(sk, &rt->u.dst);
241
242         if (!tp->write_seq)
243                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
244                                                            inet->daddr,
245                                                            inet->sport,
246                                                            usin->sin_port);
247
248         inet->id = tp->write_seq ^ jiffies;
249
250         err = tcp_connect(sk);
251         rt = NULL;
252         if (err)
253                 goto failure;
254
255         return 0;
256
257 failure:
258         /*
259          * This unhashes the socket and releases the local port,
260          * if necessary.
261          */
262         tcp_set_state(sk, TCP_CLOSE);
263         ip_rt_put(rt);
264         sk->sk_route_caps = 0;
265         inet->dport = 0;
266         return err;
267 }
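/*
 * Illustrative sketch (not part of the original file): tcp_v4_connect() is
 * the AF_INET connect handler reached via inet_stream_connect(), i.e. what
 * ultimately runs when an application calls connect() on a TCP socket.  A
 * minimal user-space caller, with a made-up destination address, would be:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */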
268
269 /*
270  * This routine does path mtu discovery as defined in RFC1191.
271  */
272 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
273 {
274         struct dst_entry *dst;
275         struct inet_sock *inet = inet_sk(sk);
276
277         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
278          * sent out by Linux are always < 576 bytes, so they should go through
279          * unfragmented).
280          */
281         if (sk->sk_state == TCP_LISTEN)
282                 return;
283
284         /* We don't check in the dst entry whether pmtu discovery is forbidden
285          * on this route. We just assume that no packet-too-big packets
286          * are sent back when pmtu discovery is not active.
287          * There is a small race when the user changes this flag in the
288          * route, but I think that's acceptable.
289          */
290         if ((dst = __sk_dst_check(sk, 0)) == NULL)
291                 return;
292
293         dst->ops->update_pmtu(dst, mtu);
294
295         /* Something is about to go wrong... Remember the soft error
296          * for the case that this connection will not be able to recover.
297          */
298         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
299                 sk->sk_err_soft = EMSGSIZE;
300
301         mtu = dst_mtu(dst);
302
303         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
304             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
305                 tcp_sync_mss(sk, mtu);
306
307                 /* Resend the TCP packet because it's
308                  * clear that the old packet has been
309                  * dropped. This is the new "fast" path mtu
310                  * discovery.
311                  */
312                 tcp_simple_retransmit(sk);
313         } /* else let the usual retransmit timer handle it */
314 }
315
316 /*
317  * This routine is called by the ICMP module when it gets some
318  * sort of error condition.  If err < 0 then the socket should
319  * be closed and the error returned to the user.  If err > 0
320  * it's just the icmp type << 8 | icmp code.  After adjustment
321  * header points to the first 8 bytes of the tcp header.  We need
322  * to find the appropriate port.
323  *
324  * The locking strategy used here is very "optimistic". When
325  * someone else accesses the socket the ICMP is just dropped
326  * and for some paths there is no check at all.
327  * A more general error queue to queue errors for later handling
328  * is probably better.
329  *
330  */
331
332 void tcp_v4_err(struct sk_buff *skb, u32 info)
333 {
334         struct iphdr *iph = (struct iphdr *)skb->data;
335         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
336         struct tcp_sock *tp;
337         struct inet_sock *inet;
338         const int type = icmp_hdr(skb)->type;
339         const int code = icmp_hdr(skb)->code;
340         struct sock *sk;
341         __u32 seq;
342         int err;
343
344         if (skb->len < (iph->ihl << 2) + 8) {
345                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
346                 return;
347         }
348
349         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
350                         iph->saddr, th->source, inet_iif(skb));
351         if (!sk) {
352                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
353                 return;
354         }
355         if (sk->sk_state == TCP_TIME_WAIT) {
356                 inet_twsk_put(inet_twsk(sk));
357                 return;
358         }
359
360         bh_lock_sock(sk);
361         /* If too many ICMPs get dropped on busy
362          * servers this needs to be solved differently.
363          */
364         if (sock_owned_by_user(sk))
365                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
366
367         if (sk->sk_state == TCP_CLOSE)
368                 goto out;
369
370         tp = tcp_sk(sk);
371         seq = ntohl(th->seq);
372         if (sk->sk_state != TCP_LISTEN &&
373             !between(seq, tp->snd_una, tp->snd_nxt)) {
374                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
375                 goto out;
376         }
377
378         switch (type) {
379         case ICMP_SOURCE_QUENCH:
380                 /* Just silently ignore these. */
381                 goto out;
382         case ICMP_PARAMETERPROB:
383                 err = EPROTO;
384                 break;
385         case ICMP_DEST_UNREACH:
386                 if (code > NR_ICMP_UNREACH)
387                         goto out;
388
389                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
390                         if (!sock_owned_by_user(sk))
391                                 do_pmtu_discovery(sk, iph, info);
392                         goto out;
393                 }
394
395                 err = icmp_err_convert[code].errno;
396                 break;
397         case ICMP_TIME_EXCEEDED:
398                 err = EHOSTUNREACH;
399                 break;
400         default:
401                 goto out;
402         }
403
404         switch (sk->sk_state) {
405                 struct request_sock *req, **prev;
406         case TCP_LISTEN:
407                 if (sock_owned_by_user(sk))
408                         goto out;
409
410                 req = inet_csk_search_req(sk, &prev, th->dest,
411                                           iph->daddr, iph->saddr);
412                 if (!req)
413                         goto out;
414
415                 /* ICMPs are not backlogged, hence we cannot get
416                    an established socket here.
417                  */
418                 BUG_TRAP(!req->sk);
419
420                 if (seq != tcp_rsk(req)->snt_isn) {
421                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
422                         goto out;
423                 }
424
425                 /*
426                  * Still in SYN_RECV, just remove it silently.
427                  * There is no good way to pass the error to the newly
428                  * created socket, and POSIX does not want network
429                  * errors returned from accept().
430                  */
431                 inet_csk_reqsk_queue_drop(sk, req, prev);
432                 goto out;
433
434         case TCP_SYN_SENT:
435         case TCP_SYN_RECV:  /* Cannot happen?
436                                It can, e.g., if SYNs crossed.
437                              */
438                 if (!sock_owned_by_user(sk)) {
439                         sk->sk_err = err;
440
441                         sk->sk_error_report(sk);
442
443                         tcp_done(sk);
444                 } else {
445                         sk->sk_err_soft = err;
446                 }
447                 goto out;
448         }
449
450         /* If we've already connected we will keep trying
451          * until we time out, or the user gives up.
452          *
453          * rfc1122 4.2.3.9 allows us to treat as hard errors
454          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
455          * but it is obsoleted by pmtu discovery).
456          *
457          * Note that in the modern internet, where routing is unreliable
458          * and broken firewalls sit in every dark corner sending random
459          * errors as ordered by their masters, even these two messages have
460          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
461          *
462          * Now we are in compliance with RFCs.
463          *                                                      --ANK (980905)
464          */
465
466         inet = inet_sk(sk);
467         if (!sock_owned_by_user(sk) && inet->recverr) {
468                 sk->sk_err = err;
469                 sk->sk_error_report(sk);
470         } else  { /* Only an error on timeout */
471                 sk->sk_err_soft = err;
472         }
473
474 out:
475         bh_unlock_sock(sk);
476         sock_put(sk);
477 }
478
479 /* This routine computes an IPv4 TCP checksum. */
480 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
481 {
482         struct inet_sock *inet = inet_sk(sk);
483         struct tcphdr *th = tcp_hdr(skb);
484
485         if (skb->ip_summed == CHECKSUM_PARTIAL) {
486                 th->check = ~tcp_v4_check(len, inet->saddr,
487                                           inet->daddr, 0);
488                 skb->csum_start = skb_transport_header(skb) - skb->head;
489                 skb->csum_offset = offsetof(struct tcphdr, check);
490         } else {
491                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
492                                          csum_partial((char *)th,
493                                                       th->doff << 2,
494                                                       skb->csum));
495         }
496 }
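/*
 * Note (added for exposition): in the CHECKSUM_PARTIAL branch above only the
 * pseudo-header sum is written into th->check; csum_start/csum_offset tell
 * the device (or the software fallback) where to fold in the checksum over
 * the actual TCP header and payload.  The else branch computes the complete
 * checksum in software.
 */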
497
498 int tcp_v4_gso_send_check(struct sk_buff *skb)
499 {
500         const struct iphdr *iph;
501         struct tcphdr *th;
502
503         if (!pskb_may_pull(skb, sizeof(*th)))
504                 return -EINVAL;
505
506         iph = ip_hdr(skb);
507         th = tcp_hdr(skb);
508
509         th->check = 0;
510         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
511         skb->csum_start = skb_transport_header(skb) - skb->head;
512         skb->csum_offset = offsetof(struct tcphdr, check);
513         skb->ip_summed = CHECKSUM_PARTIAL;
514         return 0;
515 }
516
517 /*
518  *      This routine will send an RST to the other tcp.
519  *
520  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
521  *                    for the reset?
522  *      Answer: if a packet caused the RST, it is not for a socket
523  *              existing in our system; if it is matched to a socket,
524  *              it is just a duplicate segment or a bug in the other side's TCP.
525  *              So we build the reply based only on the parameters that
526  *              arrived with the segment.
527  *      Exception: precedence violation. We do not implement it in any case.
528  */
529
530 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
531 {
532         struct tcphdr *th = tcp_hdr(skb);
533         struct {
534                 struct tcphdr th;
535 #ifdef CONFIG_TCP_MD5SIG
536                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
537 #endif
538         } rep;
539         struct ip_reply_arg arg;
540 #ifdef CONFIG_TCP_MD5SIG
541         struct tcp_md5sig_key *key;
542 #endif
543
544         /* Never send a reset in response to a reset. */
545         if (th->rst)
546                 return;
547
548         if (skb->rtable->rt_type != RTN_LOCAL)
549                 return;
550
551         /* Swap the send and the receive. */
552         memset(&rep, 0, sizeof(rep));
553         rep.th.dest   = th->source;
554         rep.th.source = th->dest;
555         rep.th.doff   = sizeof(struct tcphdr) / 4;
556         rep.th.rst    = 1;
557
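	/* Descriptive note (not in the original): per RFC 793 reset
	 * generation, if the offending segment carried an ACK we reuse its
	 * acknowledgment number as our sequence number; otherwise we send
	 * seq 0 with an ACK covering everything the segment occupied.
	 */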
558         if (th->ack) {
559                 rep.th.seq = th->ack_seq;
560         } else {
561                 rep.th.ack = 1;
562                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
563                                        skb->len - (th->doff << 2));
564         }
565
566         memset(&arg, 0, sizeof(arg));
567         arg.iov[0].iov_base = (unsigned char *)&rep;
568         arg.iov[0].iov_len  = sizeof(rep.th);
569
570 #ifdef CONFIG_TCP_MD5SIG
571         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
572         if (key) {
573                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
574                                    (TCPOPT_NOP << 16) |
575                                    (TCPOPT_MD5SIG << 8) |
576                                    TCPOLEN_MD5SIG);
577                 /* Update length and the length the header thinks exists */
578                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
579                 rep.th.doff = arg.iov[0].iov_len / 4;
580
581                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
582                                         key,
583                                         ip_hdr(skb)->daddr,
584                                         ip_hdr(skb)->saddr,
585                                         &rep.th, IPPROTO_TCP,
586                                         arg.iov[0].iov_len);
587         }
588 #endif
589         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
590                                       ip_hdr(skb)->saddr, /* XXX */
591                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
592         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
593
594         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
595                       &arg, arg.iov[0].iov_len);
596
597         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
598         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
599 }
600
601 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
602    outside socket context, is certainly ugly. What can I do?
603  */
604
605 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
606                             struct sk_buff *skb, u32 seq, u32 ack,
607                             u32 win, u32 ts)
608 {
609         struct tcphdr *th = tcp_hdr(skb);
610         struct {
611                 struct tcphdr th;
612                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
613 #ifdef CONFIG_TCP_MD5SIG
614                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
615 #endif
616                         ];
617         } rep;
618         struct ip_reply_arg arg;
619 #ifdef CONFIG_TCP_MD5SIG
620         struct tcp_md5sig_key *key;
621         struct tcp_md5sig_key tw_key;
622 #endif
623
624         memset(&rep.th, 0, sizeof(struct tcphdr));
625         memset(&arg, 0, sizeof(arg));
626
627         arg.iov[0].iov_base = (unsigned char *)&rep;
628         arg.iov[0].iov_len  = sizeof(rep.th);
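	/* Note (added for exposition): when echoing a timestamp, two NOPs pad
	 * the 10-byte TCPOPT_TIMESTAMP option out to the 12-byte aligned size
	 * (TCPOLEN_TSTAMP_ALIGNED) so the option area stays 32-bit aligned.
	 */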
629         if (ts) {
630                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
631                                    (TCPOPT_TIMESTAMP << 8) |
632                                    TCPOLEN_TIMESTAMP);
633                 rep.opt[1] = htonl(tcp_time_stamp);
634                 rep.opt[2] = htonl(ts);
635                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
636         }
637
638         /* Swap the send and the receive. */
639         rep.th.dest    = th->source;
640         rep.th.source  = th->dest;
641         rep.th.doff    = arg.iov[0].iov_len / 4;
642         rep.th.seq     = htonl(seq);
643         rep.th.ack_seq = htonl(ack);
644         rep.th.ack     = 1;
645         rep.th.window  = htons(win);
646
647 #ifdef CONFIG_TCP_MD5SIG
648         /*
649          * The SKB holds an incoming packet, but may not have a valid ->sk
650          * pointer. This is especially the case when we're dealing with a
651          * TIME_WAIT ack, because the sk structure is long gone, and only
652          * the tcp_timewait_sock remains. So the md5 key is stashed in that
653          * structure, and we use it in preference.  I believe that (twsk ||
654          * skb->sk) holds true, but we program defensively.
655          */
656         if (!twsk && skb->sk) {
657                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
658         } else if (twsk && twsk->tw_md5_keylen) {
659                 tw_key.key = twsk->tw_md5_key;
660                 tw_key.keylen = twsk->tw_md5_keylen;
661                 key = &tw_key;
662         } else
663                 key = NULL;
664
665         if (key) {
666                 int offset = (ts) ? 3 : 0;
667
668                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
669                                           (TCPOPT_NOP << 16) |
670                                           (TCPOPT_MD5SIG << 8) |
671                                           TCPOLEN_MD5SIG);
672                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
673                 rep.th.doff = arg.iov[0].iov_len/4;
674
675                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
676                                         key,
677                                         ip_hdr(skb)->daddr,
678                                         ip_hdr(skb)->saddr,
679                                         &rep.th, IPPROTO_TCP,
680                                         arg.iov[0].iov_len);
681         }
682 #endif
683         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
684                                       ip_hdr(skb)->saddr, /* XXX */
685                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
686         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
687         if (twsk)
688                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
689
690         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
691                       &arg, arg.iov[0].iov_len);
692
693         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
694 }
695
696 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
697 {
698         struct inet_timewait_sock *tw = inet_twsk(sk);
699         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
700
701         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
702                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
703                         tcptw->tw_ts_recent);
704
705         inet_twsk_put(tw);
706 }
707
708 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
709                                   struct request_sock *req)
710 {
711         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
712                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
713                         req->ts_recent);
714 }
715
716 /*
717  *      Send a SYN-ACK after having received a SYN.
718  *      This still operates on a request_sock only, not on a big
719  *      socket.
720  */
721 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
722                                 struct dst_entry *dst)
723 {
724         const struct inet_request_sock *ireq = inet_rsk(req);
725         int err = -1;
726         struct sk_buff * skb;
727
728         /* First, grab a route. */
729         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
730                 return -1;
731
732         skb = tcp_make_synack(sk, dst, req);
733
734         if (skb) {
735                 struct tcphdr *th = tcp_hdr(skb);
736
737                 th->check = tcp_v4_check(skb->len,
738                                          ireq->loc_addr,
739                                          ireq->rmt_addr,
740                                          csum_partial((char *)th, skb->len,
741                                                       skb->csum));
742
743                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
744                                             ireq->rmt_addr,
745                                             ireq->opt);
746                 err = net_xmit_eval(err);
747         }
748
749         dst_release(dst);
750         return err;
751 }
752
753 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
754 {
755         return __tcp_v4_send_synack(sk, req, NULL);
756 }
757
758 /*
759  *      IPv4 request_sock destructor.
760  */
761 static void tcp_v4_reqsk_destructor(struct request_sock *req)
762 {
763         kfree(inet_rsk(req)->opt);
764 }
765
766 #ifdef CONFIG_SYN_COOKIES
767 static void syn_flood_warning(struct sk_buff *skb)
768 {
769         static unsigned long warntime;
770
771         if (time_after(jiffies, (warntime + HZ * 60))) {
772                 warntime = jiffies;
773                 printk(KERN_INFO
774                        "possible SYN flooding on port %d. Sending cookies.\n",
775                        ntohs(tcp_hdr(skb)->dest));
776         }
777 }
778 #endif
779
780 /*
781  * Save and compile IPv4 options into the request_sock if needed.
782  */
783 static struct ip_options *tcp_v4_save_options(struct sock *sk,
784                                               struct sk_buff *skb)
785 {
786         struct ip_options *opt = &(IPCB(skb)->opt);
787         struct ip_options *dopt = NULL;
788
789         if (opt && opt->optlen) {
790                 int opt_size = optlength(opt);
791                 dopt = kmalloc(opt_size, GFP_ATOMIC);
792                 if (dopt) {
793                         if (ip_options_echo(dopt, skb)) {
794                                 kfree(dopt);
795                                 dopt = NULL;
796                         }
797                 }
798         }
799         return dopt;
800 }
801
802 #ifdef CONFIG_TCP_MD5SIG
803 /*
804  * RFC2385 MD5 checksumming requires a mapping of
805  * IP address->MD5 Key.
806  * We need to maintain these in the sk structure.
807  */
808
809 /* Find the Key structure for an address.  */
810 static struct tcp_md5sig_key *
811                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
812 {
813         struct tcp_sock *tp = tcp_sk(sk);
814         int i;
815
816         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
817                 return NULL;
818         for (i = 0; i < tp->md5sig_info->entries4; i++) {
819                 if (tp->md5sig_info->keys4[i].addr == addr)
820                         return &tp->md5sig_info->keys4[i].base;
821         }
822         return NULL;
823 }
824
825 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
826                                          struct sock *addr_sk)
827 {
828         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
829 }
830
831 EXPORT_SYMBOL(tcp_v4_md5_lookup);
832
833 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
834                                                       struct request_sock *req)
835 {
836         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
837 }
838
839 /* This can be called on a newly created socket, from other files */
840 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
841                       u8 *newkey, u8 newkeylen)
842 {
843         /* Add Key to the list */
844         struct tcp_md5sig_key *key;
845         struct tcp_sock *tp = tcp_sk(sk);
846         struct tcp4_md5sig_key *keys;
847
848         key = tcp_v4_md5_do_lookup(sk, addr);
849         if (key) {
850                 /* Pre-existing entry - just update that one. */
851                 kfree(key->key);
852                 key->key = newkey;
853                 key->keylen = newkeylen;
854         } else {
855                 struct tcp_md5sig_info *md5sig;
856
857                 if (!tp->md5sig_info) {
858                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
859                                                   GFP_ATOMIC);
860                         if (!tp->md5sig_info) {
861                                 kfree(newkey);
862                                 return -ENOMEM;
863                         }
864                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
865                 }
866                 if (tcp_alloc_md5sig_pool() == NULL) {
867                         kfree(newkey);
868                         return -ENOMEM;
869                 }
870                 md5sig = tp->md5sig_info;
871
872                 if (md5sig->alloced4 == md5sig->entries4) {
873                         keys = kmalloc((sizeof(*keys) *
874                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
875                         if (!keys) {
876                                 kfree(newkey);
877                                 tcp_free_md5sig_pool();
878                                 return -ENOMEM;
879                         }
880
881                         if (md5sig->entries4)
882                                 memcpy(keys, md5sig->keys4,
883                                        sizeof(*keys) * md5sig->entries4);
884
885                         /* Free old key list, and reference new one */
886                         kfree(md5sig->keys4);
887                         md5sig->keys4 = keys;
888                         md5sig->alloced4++;
889                 }
890                 md5sig->entries4++;
891                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
892                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
893                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
894         }
895         return 0;
896 }
897
898 EXPORT_SYMBOL(tcp_v4_md5_do_add);
899
900 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
901                                u8 *newkey, u8 newkeylen)
902 {
903         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
904                                  newkey, newkeylen);
905 }
906
907 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
908 {
909         struct tcp_sock *tp = tcp_sk(sk);
910         int i;
911
912         for (i = 0; i < tp->md5sig_info->entries4; i++) {
913                 if (tp->md5sig_info->keys4[i].addr == addr) {
914                         /* Free the key */
915                         kfree(tp->md5sig_info->keys4[i].base.key);
916                         tp->md5sig_info->entries4--;
917
918                         if (tp->md5sig_info->entries4 == 0) {
919                                 kfree(tp->md5sig_info->keys4);
920                                 tp->md5sig_info->keys4 = NULL;
921                                 tp->md5sig_info->alloced4 = 0;
922                         } else if (tp->md5sig_info->entries4 != i) {
923                                 /* Shift the remaining keys down over the hole */
924                                 memmove(&tp->md5sig_info->keys4[i],
925                                         &tp->md5sig_info->keys4[i+1],
926                                         (tp->md5sig_info->entries4 - i) *
927                                          sizeof(struct tcp4_md5sig_key));
928                         }
929                         tcp_free_md5sig_pool();
930                         return 0;
931                 }
932         }
933         return -ENOENT;
934 }
935
936 EXPORT_SYMBOL(tcp_v4_md5_do_del);
937
938 static void tcp_v4_clear_md5_list(struct sock *sk)
939 {
940         struct tcp_sock *tp = tcp_sk(sk);
941
942         /* Free each key, then the set of keys,
943          * the crypto element, and then decrement our
944          * hold on the last-resort crypto.
945          */
946         if (tp->md5sig_info->entries4) {
947                 int i;
948                 for (i = 0; i < tp->md5sig_info->entries4; i++)
949                         kfree(tp->md5sig_info->keys4[i].base.key);
950                 tp->md5sig_info->entries4 = 0;
951                 tcp_free_md5sig_pool();
952         }
953         if (tp->md5sig_info->keys4) {
954                 kfree(tp->md5sig_info->keys4);
955                 tp->md5sig_info->keys4 = NULL;
956                 tp->md5sig_info->alloced4  = 0;
957         }
958 }
959
960 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
961                                  int optlen)
962 {
963         struct tcp_md5sig cmd;
964         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
965         u8 *newkey;
966
967         if (optlen < sizeof(cmd))
968                 return -EINVAL;
969
970         if (copy_from_user(&cmd, optval, sizeof(cmd)))
971                 return -EFAULT;
972
973         if (sin->sin_family != AF_INET)
974                 return -EINVAL;
975
976         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
977                 if (!tcp_sk(sk)->md5sig_info)
978                         return -ENOENT;
979                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
980         }
981
982         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
983                 return -EINVAL;
984
985         if (!tcp_sk(sk)->md5sig_info) {
986                 struct tcp_sock *tp = tcp_sk(sk);
987                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
988
989                 if (!p)
990                         return -ENOMEM;
991
992                 tp->md5sig_info = p;
993                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
994         }
995
996         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
997         if (!newkey)
998                 return -ENOMEM;
999         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1000                                  newkey, cmd.tcpm_keylen);
1001 }
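/*
 * Illustrative sketch (not part of the original file): the handler above
 * services the TCP_MD5SIG socket option.  A user-space peer would install a
 * key roughly like this (the address and key are made-up examples):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */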
1002
1003 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1004                                    __be32 saddr, __be32 daddr,
1005                                    struct tcphdr *th, int protocol,
1006                                    unsigned int tcplen)
1007 {
1008         struct scatterlist sg[4];
1009         __u16 data_len;
1010         int block = 0;
1011         __sum16 old_checksum;
1012         struct tcp_md5sig_pool *hp;
1013         struct tcp4_pseudohdr *bp;
1014         struct hash_desc *desc;
1015         int err;
1016         unsigned int nbytes = 0;
1017
1018         /*
1019          * Okay, so RFC2385 is turned on for this connection,
1020          * so we need to generate the MD5 hash for the packet now.
1021          */
1022
1023         hp = tcp_get_md5sig_pool();
1024         if (!hp)
1025                 goto clear_hash_noput;
1026
1027         bp = &hp->md5_blk.ip4;
1028         desc = &hp->md5_desc;
1029
1030         /*
1031          * 1. the TCP pseudo-header (in the order: source IP address,
1032          * destination IP address, zero-padded protocol number, and
1033          * segment length)
1034          */
1035         bp->saddr = saddr;
1036         bp->daddr = daddr;
1037         bp->pad = 0;
1038         bp->protocol = protocol;
1039         bp->len = htons(tcplen);
1040
1041         sg_init_table(sg, 4);
1042
1043         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1044         nbytes += sizeof(*bp);
1045
1046         /* 2. the TCP header, excluding options, and assuming a
1047          * checksum of zero.
1048          */
1049         old_checksum = th->check;
1050         th->check = 0;
1051         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1052         nbytes += sizeof(struct tcphdr);
1053
1054         /* 3. the TCP segment data (if any) */
1055         data_len = tcplen - (th->doff << 2);
1056         if (data_len > 0) {
1057                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1058                 sg_set_buf(&sg[block++], data, data_len);
1059                 nbytes += data_len;
1060         }
1061
1062         /* 4. an independently-specified key or password, known to both
1063          * TCPs and presumably connection-specific
1064          */
1065         sg_set_buf(&sg[block++], key->key, key->keylen);
1066         nbytes += key->keylen;
1067
1068         sg_mark_end(&sg[block - 1]);
1069
1070         /* Now store the Hash into the packet */
1071         err = crypto_hash_init(desc);
1072         if (err)
1073                 goto clear_hash;
1074         err = crypto_hash_update(desc, sg, nbytes);
1075         if (err)
1076                 goto clear_hash;
1077         err = crypto_hash_final(desc, md5_hash);
1078         if (err)
1079                 goto clear_hash;
1080
1081         /* Reset header, and free up the crypto */
1082         tcp_put_md5sig_pool();
1083         th->check = old_checksum;
1084
1085 out:
1086         return 0;
1087 clear_hash:
1088         tcp_put_md5sig_pool();
1089 clear_hash_noput:
1090         memset(md5_hash, 0, 16);
1091         goto out;
1092 }
1093
1094 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1095                          struct sock *sk,
1096                          struct dst_entry *dst,
1097                          struct request_sock *req,
1098                          struct tcphdr *th, int protocol,
1099                          unsigned int tcplen)
1100 {
1101         __be32 saddr, daddr;
1102
1103         if (sk) {
1104                 saddr = inet_sk(sk)->saddr;
1105                 daddr = inet_sk(sk)->daddr;
1106         } else {
1107                 struct rtable *rt = (struct rtable *)dst;
1108                 BUG_ON(!rt);
1109                 saddr = rt->rt_src;
1110                 daddr = rt->rt_dst;
1111         }
1112         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1113                                        saddr, daddr,
1114                                        th, protocol, tcplen);
1115 }
1116
1117 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1118
1119 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1120 {
1121         /*
1122          * This gets called for each TCP segment that arrives
1123          * so we want to be efficient.
1124          * We have 3 drop cases:
1125          * o No MD5 hash and one expected.
1126          * o MD5 hash and we're not expecting one.
1127          * o MD5 hash and it's wrong.
1128          */
1129         __u8 *hash_location = NULL;
1130         struct tcp_md5sig_key *hash_expected;
1131         const struct iphdr *iph = ip_hdr(skb);
1132         struct tcphdr *th = tcp_hdr(skb);
1133         int length = (th->doff << 2) - sizeof(struct tcphdr);
1134         int genhash;
1135         unsigned char *ptr;
1136         unsigned char newhash[16];
1137
1138         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1139
1140         /*
1141          * If the TCP option length is less than the TCP_MD5SIG
1142          * option length, then we can shortcut
1143          */
1144         if (length < TCPOLEN_MD5SIG) {
1145                 if (hash_expected)
1146                         return 1;
1147                 else
1148                         return 0;
1149         }
1150
1151         /* Okay, we can't shortcut - we have to grub through the options */
1152         ptr = (unsigned char *)(th + 1);
1153         while (length > 0) {
1154                 int opcode = *ptr++;
1155                 int opsize;
1156
1157                 switch (opcode) {
1158                 case TCPOPT_EOL:
1159                         goto done_opts;
1160                 case TCPOPT_NOP:
1161                         length--;
1162                         continue;
1163                 default:
1164                         opsize = *ptr++;
1165                         if (opsize < 2)
1166                                 goto done_opts;
1167                         if (opsize > length)
1168                                 goto done_opts;
1169
1170                         if (opcode == TCPOPT_MD5SIG) {
1171                                 hash_location = ptr;
1172                                 goto done_opts;
1173                         }
1174                 }
1175                 ptr += opsize-2;
1176                 length -= opsize;
1177         }
1178 done_opts:
1179         /* We've parsed the options - do we have a hash? */
1180         if (!hash_expected && !hash_location)
1181                 return 0;
1182
1183         if (hash_expected && !hash_location) {
1184                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1185                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1186                                NIPQUAD(iph->saddr), ntohs(th->source),
1187                                NIPQUAD(iph->daddr), ntohs(th->dest));
1188                 return 1;
1189         }
1190
1191         if (!hash_expected && hash_location) {
1192                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1193                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1194                                NIPQUAD(iph->saddr), ntohs(th->source),
1195                                NIPQUAD(iph->daddr), ntohs(th->dest));
1196                 return 1;
1197         }
1198
1199         /* Okay, so this is hash_expected and hash_location -
1200          * so we need to calculate the hash.
1201          */
1202         genhash = tcp_v4_do_calc_md5_hash(newhash,
1203                                           hash_expected,
1204                                           iph->saddr, iph->daddr,
1205                                           th, sk->sk_protocol,
1206                                           skb->len);
1207
1208         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1209                 if (net_ratelimit()) {
1210                         printk(KERN_INFO "MD5 Hash failed for "
1211                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1212                                NIPQUAD(iph->saddr), ntohs(th->source),
1213                                NIPQUAD(iph->daddr), ntohs(th->dest),
1214                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1215                 }
1216                 return 1;
1217         }
1218         return 0;
1219 }
1220
1221 #endif
1222
1223 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1224         .family         =       PF_INET,
1225         .obj_size       =       sizeof(struct tcp_request_sock),
1226         .rtx_syn_ack    =       tcp_v4_send_synack,
1227         .send_ack       =       tcp_v4_reqsk_send_ack,
1228         .destructor     =       tcp_v4_reqsk_destructor,
1229         .send_reset     =       tcp_v4_send_reset,
1230 };
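/*
 * Note (added for exposition): this ops table plugs the IPv4-specific
 * SYN-ACK, ACK and RST senders above into the address-family independent
 * request_sock machinery used by the listener code.
 */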
1231
1232 #ifdef CONFIG_TCP_MD5SIG
1233 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1234         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1235 };
1236 #endif
1237
1238 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1239         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1240         .twsk_unique    = tcp_twsk_unique,
1241         .twsk_destructor= tcp_twsk_destructor,
1242 };
1243
1244 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1245 {
1246         struct inet_request_sock *ireq;
1247         struct tcp_options_received tmp_opt;
1248         struct request_sock *req;
1249         __be32 saddr = ip_hdr(skb)->saddr;
1250         __be32 daddr = ip_hdr(skb)->daddr;
1251         __u32 isn = TCP_SKB_CB(skb)->when;
1252         struct dst_entry *dst = NULL;
1253 #ifdef CONFIG_SYN_COOKIES
1254         int want_cookie = 0;
1255 #else
1256 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1257 #endif
1258
1259         /* Never answer SYNs sent to broadcast or multicast addresses */
1260         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1261                 goto drop;
1262
1263         /* TW buckets are converted to open requests without
1264          * limitations; they conserve resources and the peer is
1265          * evidently a real one.
1266          */
1267         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1268 #ifdef CONFIG_SYN_COOKIES
1269                 if (sysctl_tcp_syncookies) {
1270                         want_cookie = 1;
1271                 } else
1272 #endif
1273                 goto drop;
1274         }
1275
1276         /* Accept backlog is full. If we have already queued enough
1277          * warm entries in the syn queue, drop the request. It is better than
1278          * clogging the syn queue with openreqs with exponentially increasing
1279          * timeouts.
1280          */
1281         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1282                 goto drop;
1283
1284         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1285         if (!req)
1286                 goto drop;
1287
1288 #ifdef CONFIG_TCP_MD5SIG
1289         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1290 #endif
1291
1292         tcp_clear_options(&tmp_opt);
1293         tmp_opt.mss_clamp = 536;
1294         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1295
1296         tcp_parse_options(skb, &tmp_opt, 0);
1297
1298         if (want_cookie && !tmp_opt.saw_tstamp)
1299                 tcp_clear_options(&tmp_opt);
1300
1301         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1302                 /* Some OSes (unknown ones, but I see them on a web server which
1303                  * contains information interesting only for Windows
1304                  * users) do not send their timestamp in the SYN. It is an easy case:
1305                  * we simply do not advertise TS support.
1306                  */
1307                 tmp_opt.saw_tstamp = 0;
1308                 tmp_opt.tstamp_ok  = 0;
1309         }
1310         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1311
1312         tcp_openreq_init(req, &tmp_opt, skb);
1313
1314         if (security_inet_conn_request(sk, skb, req))
1315                 goto drop_and_free;
1316
1317         ireq = inet_rsk(req);
1318         ireq->loc_addr = daddr;
1319         ireq->rmt_addr = saddr;
1320         ireq->opt = tcp_v4_save_options(sk, skb);
1321         if (!want_cookie)
1322                 TCP_ECN_create_request(req, tcp_hdr(skb));
1323
1324         if (want_cookie) {
1325 #ifdef CONFIG_SYN_COOKIES
1326                 syn_flood_warning(skb);
1327                 req->cookie_ts = tmp_opt.tstamp_ok;
1328 #endif
1329                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1330         } else if (!isn) {
1331                 struct inet_peer *peer = NULL;
1332
1333                 /* VJ's idea. We save the last timestamp seen
1334                  * from the destination in the peer table when entering
1335                  * TIME-WAIT state, and check against it before
1336                  * accepting a new connection request.
1337                  *
1338                  * If "isn" is not zero, this request hit a live
1339                  * timewait bucket, so all the necessary checks
1340                  * are made in the function processing the timewait state.
1341                  */
1342                 if (tmp_opt.saw_tstamp &&
1343                     tcp_death_row.sysctl_tw_recycle &&
1344                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1345                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1346                     peer->v4daddr == saddr) {
1347                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1348                             (s32)(peer->tcp_ts - req->ts_recent) >
1349                                                         TCP_PAWS_WINDOW) {
1350                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1351                                 goto drop_and_release;
1352                         }
1353                 }
1354                 /* Kill the following clause, if you dislike this way. */
1355                 else if (!sysctl_tcp_syncookies &&
1356                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1357                           (sysctl_max_syn_backlog >> 2)) &&
1358                          (!peer || !peer->tcp_ts_stamp) &&
1359                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1360                         /* Without syncookies the last quarter of the
1361                          * backlog is filled only with destinations
1362                          * proven to be alive.
1363                          * It means that we continue to communicate
1364                          * with destinations already remembered
1365                          * at the moment of the synflood.
1366                          */
1367                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1368                                        "request from " NIPQUAD_FMT "/%u\n",
1369                                        NIPQUAD(saddr),
1370                                        ntohs(tcp_hdr(skb)->source));
1371                         goto drop_and_release;
1372                 }
1373
1374                 isn = tcp_v4_init_sequence(skb);
1375         }
1376         tcp_rsk(req)->snt_isn = isn;
1377
1378         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1379                 goto drop_and_free;
1380
1381         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1382         return 0;
1383
1384 drop_and_release:
1385         dst_release(dst);
1386 drop_and_free:
1387         reqsk_free(req);
1388 drop:
1389         return 0;
1390 }
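/*
 * Note (added for exposition, hedged): tcp_v4_conn_request() returns 0 even
 * on the drop paths, so as far as the caller is concerned the SYN was
 * consumed; a dropped SYN is simply not answered and the client is left to
 * retransmit it.
 */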
1391
1392
1393 /*
1394  * The three-way handshake has completed - we got a valid ACK -
1395  * now create the new socket.
1396  */
1397 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1398                                   struct request_sock *req,
1399                                   struct dst_entry *dst)
1400 {
1401         struct inet_request_sock *ireq;
1402         struct inet_sock *newinet;
1403         struct tcp_sock *newtp;
1404         struct sock *newsk;
1405 #ifdef CONFIG_TCP_MD5SIG
1406         struct tcp_md5sig_key *key;
1407 #endif
1408
1409         if (sk_acceptq_is_full(sk))
1410                 goto exit_overflow;
1411
1412         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1413                 goto exit;
1414
1415         newsk = tcp_create_openreq_child(sk, req, skb);
1416         if (!newsk)
1417                 goto exit;
1418
1419         newsk->sk_gso_type = SKB_GSO_TCPV4;
1420         sk_setup_caps(newsk, dst);
1421
1422         newtp                 = tcp_sk(newsk);
1423         newinet               = inet_sk(newsk);
1424         ireq                  = inet_rsk(req);
1425         newinet->daddr        = ireq->rmt_addr;
1426         newinet->rcv_saddr    = ireq->loc_addr;
1427         newinet->saddr        = ireq->loc_addr;
1428         newinet->opt          = ireq->opt;
1429         ireq->opt             = NULL;
1430         newinet->mc_index     = inet_iif(skb);
1431         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1432         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1433         if (newinet->opt)
1434                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1435         newinet->id = newtp->write_seq ^ jiffies;
1436
1437         tcp_mtup_init(newsk);
1438         tcp_sync_mss(newsk, dst_mtu(dst));
1439         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1440         tcp_initialize_rcv_mss(newsk);
1441
1442 #ifdef CONFIG_TCP_MD5SIG
1443         /* Copy over the MD5 key from the original socket */
1444         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1445                 /*
1446                  * We're using one, so create a matching key
1447                  * on the newsk structure. If we fail to get
1448                  * memory, then we end up not copying the key
1449                  * across. Shucks.
1450                  */
1451                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1452                 if (newkey != NULL)
1453                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1454                                           newkey, key->keylen);
1455         }
1456 #endif
1457
1458         __inet_hash_nolisten(newsk);
1459         __inet_inherit_port(sk, newsk);
1460
1461         return newsk;
1462
1463 exit_overflow:
1464         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1465 exit:
1466         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1467         dst_release(dst);
1468         return NULL;
1469 }
1470
1471 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1472 {
1473         struct tcphdr *th = tcp_hdr(skb);
1474         const struct iphdr *iph = ip_hdr(skb);
1475         struct sock *nsk;
1476         struct request_sock **prev;
1477         /* Find possible connection requests. */
1478         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1479                                                        iph->saddr, iph->daddr);
1480         if (req)
1481                 return tcp_check_req(sk, skb, req, prev);
1482
1483         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1484                         th->source, iph->daddr, th->dest, inet_iif(skb));
1485
1486         if (nsk) {
1487                 if (nsk->sk_state != TCP_TIME_WAIT) {
1488                         bh_lock_sock(nsk);
1489                         return nsk;
1490                 }
1491                 inet_twsk_put(inet_twsk(nsk));
1492                 return NULL;
1493         }
1494
1495 #ifdef CONFIG_SYN_COOKIES
1496         if (!th->rst && !th->syn && th->ack)
1497                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1498 #endif
1499         return sk;
1500 }
1501
1502 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1503 {
1504         const struct iphdr *iph = ip_hdr(skb);
1505
1506         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1507                 if (!tcp_v4_check(skb->len, iph->saddr,
1508                                   iph->daddr, skb->csum)) {
1509                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1510                         return 0;
1511                 }
1512         }
1513
1514         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1515                                        skb->len, IPPROTO_TCP, 0);
1516
1517         if (skb->len <= 76) {
1518                 return __skb_checksum_complete(skb);
1519         }
1520         return 0;
1521 }
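/*
 * Illustrative sketch only (not part of this file; the helper name is
 * hypothetical): the RFC 793 one's-complement checksum that tcp_v4_check()
 * and __skb_checksum_complete() ultimately verify.  It covers a pseudo-header
 * (source, destination, protocol, TCP length) followed by the TCP header and
 * payload; a segment whose checksum field is correct folds to zero.
 */
#if 0
static u16 example_tcp_v4_csum(const u8 saddr[4], const u8 daddr[4],
                               const u8 *seg, int len)
{
        u32 sum = 0;
        int i;

        /* Pseudo-header: addresses, zero + protocol, TCP length. */
        sum += (saddr[0] << 8) | saddr[1];
        sum += (saddr[2] << 8) | saddr[3];
        sum += (daddr[0] << 8) | daddr[1];
        sum += (daddr[2] << 8) | daddr[3];
        sum += IPPROTO_TCP;
        sum += len;

        /* TCP header and payload; an odd trailing byte is zero-padded. */
        for (i = 0; i + 1 < len; i += 2)
                sum += (seg[i] << 8) | seg[i + 1];
        if (len & 1)
                sum += seg[len - 1] << 8;

        while (sum >> 16)               /* fold carries back into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);

        return (u16)~sum;               /* zero means the checksum is valid */
}
#endif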
1522
1523
1524 /* The socket must have its spinlock held when we get
1525  * here.
1526  *
1527  * We have a potential double-lock case here, so even when
1528  * doing backlog processing we use the BH locking scheme.
1529  * This is because we cannot sleep with the original spinlock
1530  * held.
1531  */
1532 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1533 {
1534         struct sock *rsk;
1535 #ifdef CONFIG_TCP_MD5SIG
1536         /*
1537          * We really want to reject the packet as early as possible
1538          * if:
1539          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1540          *  o There is an MD5 option and we're not expecting one
1541          */
1542         if (tcp_v4_inbound_md5_hash(sk, skb))
1543                 goto discard;
1544 #endif
1545
1546         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1547                 TCP_CHECK_TIMER(sk);
1548                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1549                         rsk = sk;
1550                         goto reset;
1551                 }
1552                 TCP_CHECK_TIMER(sk);
1553                 return 0;
1554         }
1555
1556         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1557                 goto csum_err;
1558
1559         if (sk->sk_state == TCP_LISTEN) {
1560                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1561                 if (!nsk)
1562                         goto discard;
1563
1564                 if (nsk != sk) {
1565                         if (tcp_child_process(sk, nsk, skb)) {
1566                                 rsk = nsk;
1567                                 goto reset;
1568                         }
1569                         return 0;
1570                 }
1571         }
1572
1573         TCP_CHECK_TIMER(sk);
1574         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1575                 rsk = sk;
1576                 goto reset;
1577         }
1578         TCP_CHECK_TIMER(sk);
1579         return 0;
1580
1581 reset:
1582         tcp_v4_send_reset(rsk, skb);
1583 discard:
1584         kfree_skb(skb);
1585         /* Be careful here. If this function gets more complicated and
1586          * gcc suffers from register pressure on the x86, sk (in %ebx)
1587          * might be destroyed here. This current version compiles correctly,
1588          * but you have been warned.
1589          */
1590         return 0;
1591
1592 csum_err:
1593         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1594         goto discard;
1595 }
1596
1597 /*
1598  *      From tcp_input.c
1599  */
1600
1601 int tcp_v4_rcv(struct sk_buff *skb)
1602 {
1603         const struct iphdr *iph;
1604         struct tcphdr *th;
1605         struct sock *sk;
1606         int ret;
1607
1608         if (skb->pkt_type != PACKET_HOST)
1609                 goto discard_it;
1610
1611         /* Count it even if it's bad */
1612         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1613
1614         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1615                 goto discard_it;
1616
1617         th = tcp_hdr(skb);
1618
1619         if (th->doff < sizeof(struct tcphdr) / 4)
1620                 goto bad_packet;
1621         if (!pskb_may_pull(skb, th->doff * 4))
1622                 goto discard_it;
1623
1624         /* An explanation is required here, I think.
1625          * Packet length and doff are validated by header prediction,
1626          * provided the case of th->doff == 0 has been eliminated above,
1627          * so we defer those checks. */
1628         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1629                 goto bad_packet;
1630
1631         th = tcp_hdr(skb);
1632         iph = ip_hdr(skb);
1633         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635                                     skb->len - th->doff * 4);
1636         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1637         TCP_SKB_CB(skb)->when    = 0;
1638         TCP_SKB_CB(skb)->flags   = iph->tos;
1639         TCP_SKB_CB(skb)->sacked  = 0;
1640
1641         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1642                         th->source, iph->daddr, th->dest, inet_iif(skb));
1643         if (!sk)
1644                 goto no_tcp_socket;
1645
1646 process:
1647         if (sk->sk_state == TCP_TIME_WAIT)
1648                 goto do_time_wait;
1649
1650         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1651                 goto discard_and_relse;
1652         nf_reset(skb);
1653
1654         if (sk_filter(sk, skb))
1655                 goto discard_and_relse;
1656
1657         skb->dev = NULL;
1658
1659         bh_lock_sock_nested(sk);
1660         ret = 0;
1661         if (!sock_owned_by_user(sk)) {
1662 #ifdef CONFIG_NET_DMA
1663                 struct tcp_sock *tp = tcp_sk(sk);
1664                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1665                         tp->ucopy.dma_chan = get_softnet_dma();
1666                 if (tp->ucopy.dma_chan)
1667                         ret = tcp_v4_do_rcv(sk, skb);
1668                 else
1669 #endif
1670                 {
1671                         if (!tcp_prequeue(sk, skb))
1672                                 ret = tcp_v4_do_rcv(sk, skb);
1673                 }
1674         } else
1675                 sk_add_backlog(sk, skb);
1676         bh_unlock_sock(sk);
1677
1678         sock_put(sk);
1679
1680         return ret;
1681
1682 no_tcp_socket:
1683         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1684                 goto discard_it;
1685
1686         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1687 bad_packet:
1688                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1689         } else {
1690                 tcp_v4_send_reset(NULL, skb);
1691         }
1692
1693 discard_it:
1694         /* Discard frame. */
1695         kfree_skb(skb);
1696         return 0;
1697
1698 discard_and_relse:
1699         sock_put(sk);
1700         goto discard_it;
1701
1702 do_time_wait:
1703         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1704                 inet_twsk_put(inet_twsk(sk));
1705                 goto discard_it;
1706         }
1707
1708         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1709                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1710                 inet_twsk_put(inet_twsk(sk));
1711                 goto discard_it;
1712         }
1713         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1714         case TCP_TW_SYN: {
1715                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1716                                                         &tcp_hashinfo,
1717                                                         iph->daddr, th->dest,
1718                                                         inet_iif(skb));
1719                 if (sk2) {
1720                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1721                         inet_twsk_put(inet_twsk(sk));
1722                         sk = sk2;
1723                         goto process;
1724                 }
1725                 /* Fall through to ACK */
1726         }
1727         case TCP_TW_ACK:
1728                 tcp_v4_timewait_ack(sk, skb);
1729                 break;
1730         case TCP_TW_RST:
1731                 goto no_tcp_socket;
1732         case TCP_TW_SUCCESS:;
1733         }
1734         goto discard_it;
1735 }
1736
1737 /* VJ's idea: save the last timestamp seen from this destination and hold
1738  * it for at least the normal timewait interval, so it can be used for
1739  * duplicate segment detection in subsequent connections before they enter
1740  * the synchronized state.
1741  */
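/* In this file the cached stamp feeds the PAWS check in tcp_v4_conn_request()
 * (when tcp_tw_recycle is enabled): while peer->tcp_ts_stamp is less than
 * TCP_PAWS_MSL seconds old, a SYN whose timestamp lags peer->tcp_ts by more
 * than TCP_PAWS_WINDOW is treated as an old duplicate and rejected
 * (LINUX_MIB_PAWSPASSIVEREJECTED).
 */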
1742
1743 int tcp_v4_remember_stamp(struct sock *sk)
1744 {
1745         struct inet_sock *inet = inet_sk(sk);
1746         struct tcp_sock *tp = tcp_sk(sk);
1747         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1748         struct inet_peer *peer = NULL;
1749         int release_it = 0;
1750
1751         if (!rt || rt->rt_dst != inet->daddr) {
1752                 peer = inet_getpeer(inet->daddr, 1);
1753                 release_it = 1;
1754         } else {
1755                 if (!rt->peer)
1756                         rt_bind_peer(rt, 1);
1757                 peer = rt->peer;
1758         }
1759
1760         if (peer) {
1761                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1762                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1763                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1764                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1765                         peer->tcp_ts = tp->rx_opt.ts_recent;
1766                 }
1767                 if (release_it)
1768                         inet_putpeer(peer);
1769                 return 1;
1770         }
1771
1772         return 0;
1773 }
1774
1775 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1776 {
1777         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1778
1779         if (peer) {
1780                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1781
1782                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1783                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1784                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1785                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1786                         peer->tcp_ts       = tcptw->tw_ts_recent;
1787                 }
1788                 inet_putpeer(peer);
1789                 return 1;
1790         }
1791
1792         return 0;
1793 }
1794
1795 struct inet_connection_sock_af_ops ipv4_specific = {
1796         .queue_xmit        = ip_queue_xmit,
1797         .send_check        = tcp_v4_send_check,
1798         .rebuild_header    = inet_sk_rebuild_header,
1799         .conn_request      = tcp_v4_conn_request,
1800         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1801         .remember_stamp    = tcp_v4_remember_stamp,
1802         .net_header_len    = sizeof(struct iphdr),
1803         .setsockopt        = ip_setsockopt,
1804         .getsockopt        = ip_getsockopt,
1805         .addr2sockaddr     = inet_csk_addr2sockaddr,
1806         .sockaddr_len      = sizeof(struct sockaddr_in),
1807         .bind_conflict     = inet_csk_bind_conflict,
1808 #ifdef CONFIG_COMPAT
1809         .compat_setsockopt = compat_ip_setsockopt,
1810         .compat_getsockopt = compat_ip_getsockopt,
1811 #endif
1812 };
1813
1814 #ifdef CONFIG_TCP_MD5SIG
1815 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1816         .md5_lookup             = tcp_v4_md5_lookup,
1817         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1818         .md5_add                = tcp_v4_md5_add_func,
1819         .md5_parse              = tcp_v4_parse_md5_keys,
1820 };
1821 #endif
1822
1823 /* NOTE: A lot of things are set to zero explicitly by the call to
1824  *       sk_alloc(), so they need not be done here.
1825  */
1826 static int tcp_v4_init_sock(struct sock *sk)
1827 {
1828         struct inet_connection_sock *icsk = inet_csk(sk);
1829         struct tcp_sock *tp = tcp_sk(sk);
1830
1831         skb_queue_head_init(&tp->out_of_order_queue);
1832         tcp_init_xmit_timers(sk);
1833         tcp_prequeue_init(tp);
1834
1835         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1836         tp->mdev = TCP_TIMEOUT_INIT;
1837
1838         /* So many TCP implementations out there (incorrectly) count the
1839          * initial SYN frame in their delayed-ACK and congestion control
1840          * algorithms that we must have the following bandaid to talk
1841          * efficiently to them.  -DaveM
1842          */
1843         tp->snd_cwnd = 2;
1844
1845         /* See draft-stevens-tcpca-spec-01 for discussion of the
1846          * initialization of these values.
1847          */
1848         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1849         tp->snd_cwnd_clamp = ~0;
1850         tp->mss_cache = 536;
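        /* 536 is the classical default IPv4 MSS: the 576-byte minimum
         * reassembly buffer minus 40 bytes of IP + TCP header.  It is
         * replaced once an MSS option is seen or PMTU discovery runs.
         */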
1851
1852         tp->reordering = sysctl_tcp_reordering;
1853         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1854
1855         sk->sk_state = TCP_CLOSE;
1856
1857         sk->sk_write_space = sk_stream_write_space;
1858         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1859
1860         icsk->icsk_af_ops = &ipv4_specific;
1861         icsk->icsk_sync_mss = tcp_sync_mss;
1862 #ifdef CONFIG_TCP_MD5SIG
1863         tp->af_specific = &tcp_sock_ipv4_specific;
1864 #endif
1865
1866         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1867         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1868
1869         atomic_inc(&tcp_sockets_allocated);
1870
1871         return 0;
1872 }
1873
1874 int tcp_v4_destroy_sock(struct sock *sk)
1875 {
1876         struct tcp_sock *tp = tcp_sk(sk);
1877
1878         tcp_clear_xmit_timers(sk);
1879
1880         tcp_cleanup_congestion_control(sk);
1881
1882         /* Clean up the write buffer. */
1883         tcp_write_queue_purge(sk);
1884
1885         /* Cleans up our, hopefully empty, out_of_order_queue. */
1886         __skb_queue_purge(&tp->out_of_order_queue);
1887
1888 #ifdef CONFIG_TCP_MD5SIG
1889         /* Clean up the MD5 key list, if any */
1890         if (tp->md5sig_info) {
1891                 tcp_v4_clear_md5_list(sk);
1892                 kfree(tp->md5sig_info);
1893                 tp->md5sig_info = NULL;
1894         }
1895 #endif
1896
1897 #ifdef CONFIG_NET_DMA
1898         /* Cleans up our sk_async_wait_queue */
1899         __skb_queue_purge(&sk->sk_async_wait_queue);
1900 #endif
1901
1902         /* Clean up the prequeue; it really should be empty already. */
1903         __skb_queue_purge(&tp->ucopy.prequeue);
1904
1905         /* Clean up a referenced TCP bind bucket. */
1906         if (inet_csk(sk)->icsk_bind_hash)
1907                 inet_put_port(sk);
1908
1909         /*
1910          * If sendmsg cached page exists, toss it.
1911          */
1912         if (sk->sk_sndmsg_page) {
1913                 __free_page(sk->sk_sndmsg_page);
1914                 sk->sk_sndmsg_page = NULL;
1915         }
1916
1917         atomic_dec(&tcp_sockets_allocated);
1918
1919         return 0;
1920 }
1921
1922 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1923
1924 #ifdef CONFIG_PROC_FS
1925 /* Proc filesystem TCP sock list dumping. */
1926
1927 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1928 {
1929         return hlist_empty(head) ? NULL :
1930                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1931 }
1932
1933 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1934 {
1935         return tw->tw_node.next ?
1936                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1937 }
1938
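/* Dump order for /proc/net/tcp: walk the listening hash, diving into each
 * listener's SYN table (TCP_SEQ_STATE_OPENREQ) as we go, then the established
 * hash, then each ehash bucket's time-wait chain.  st->bucket, st->sbucket,
 * st->num and st->state record where we are between seq_file calls.
 */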
1939 static void *listening_get_next(struct seq_file *seq, void *cur)
1940 {
1941         struct inet_connection_sock *icsk;
1942         struct hlist_node *node;
1943         struct sock *sk = cur;
1944         struct tcp_iter_state* st = seq->private;
1945         struct net *net = seq_file_net(seq);
1946
1947         if (!sk) {
1948                 st->bucket = 0;
1949                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1950                 goto get_sk;
1951         }
1952
1953         ++st->num;
1954
1955         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1956                 struct request_sock *req = cur;
1957
1958                 icsk = inet_csk(st->syn_wait_sk);
1959                 req = req->dl_next;
1960                 while (1) {
1961                         while (req) {
1962                                 if (req->rsk_ops->family == st->family &&
1963                                     net_eq(sock_net(req->sk), net)) {
1964                                         cur = req;
1965                                         goto out;
1966                                 }
1967                                 req = req->dl_next;
1968                         }
1969                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1970                                 break;
1971 get_req:
1972                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1973                 }
1974                 sk        = sk_next(st->syn_wait_sk);
1975                 st->state = TCP_SEQ_STATE_LISTENING;
1976                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1977         } else {
1978                 icsk = inet_csk(sk);
1979                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1980                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1981                         goto start_req;
1982                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1983                 sk = sk_next(sk);
1984         }
1985 get_sk:
1986         sk_for_each_from(sk, node) {
1987                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1988                         cur = sk;
1989                         goto out;
1990                 }
1991                 icsk = inet_csk(sk);
1992                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1993                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1994 start_req:
1995                         st->uid         = sock_i_uid(sk);
1996                         st->syn_wait_sk = sk;
1997                         st->state       = TCP_SEQ_STATE_OPENREQ;
1998                         st->sbucket     = 0;
1999                         goto get_req;
2000                 }
2001                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2002         }
2003         if (++st->bucket < INET_LHTABLE_SIZE) {
2004                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2005                 goto get_sk;
2006         }
2007         cur = NULL;
2008 out:
2009         return cur;
2010 }
2011
2012 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2013 {
2014         void *rc = listening_get_next(seq, NULL);
2015
2016         while (rc && *pos) {
2017                 rc = listening_get_next(seq, rc);
2018                 --*pos;
2019         }
2020         return rc;
2021 }
2022
2023 static void *established_get_first(struct seq_file *seq)
2024 {
2025         struct tcp_iter_state* st = seq->private;
2026         struct net *net = seq_file_net(seq);
2027         void *rc = NULL;
2028
2029         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2030                 struct sock *sk;
2031                 struct hlist_node *node;
2032                 struct inet_timewait_sock *tw;
2033                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2034
2035                 read_lock_bh(lock);
2036                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2037                         if (sk->sk_family != st->family ||
2038                             !net_eq(sock_net(sk), net)) {
2039                                 continue;
2040                         }
2041                         rc = sk;
2042                         goto out;
2043                 }
2044                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2045                 inet_twsk_for_each(tw, node,
2046                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2047                         if (tw->tw_family != st->family ||
2048                             !net_eq(twsk_net(tw), net)) {
2049                                 continue;
2050                         }
2051                         rc = tw;
2052                         goto out;
2053                 }
2054                 read_unlock_bh(lock);
2055                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2056         }
2057 out:
2058         return rc;
2059 }
2060
2061 static void *established_get_next(struct seq_file *seq, void *cur)
2062 {
2063         struct sock *sk = cur;
2064         struct inet_timewait_sock *tw;
2065         struct hlist_node *node;
2066         struct tcp_iter_state* st = seq->private;
2067         struct net *net = seq_file_net(seq);
2068
2069         ++st->num;
2070
2071         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2072                 tw = cur;
2073                 tw = tw_next(tw);
2074 get_tw:
2075                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2076                         tw = tw_next(tw);
2077                 }
2078                 if (tw) {
2079                         cur = tw;
2080                         goto out;
2081                 }
2082                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2083                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2084
2085                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2086                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2087                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2088                 } else {
2089                         cur = NULL;
2090                         goto out;
2091                 }
2092         } else
2093                 sk = sk_next(sk);
2094
2095         sk_for_each_from(sk, node) {
2096                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2097                         goto found;
2098         }
2099
2100         st->state = TCP_SEQ_STATE_TIME_WAIT;
2101         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2102         goto get_tw;
2103 found:
2104         cur = sk;
2105 out:
2106         return cur;
2107 }
2108
2109 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2110 {
2111         void *rc = established_get_first(seq);
2112
2113         while (rc && pos) {
2114                 rc = established_get_next(seq, rc);
2115                 --pos;
2116         }
2117         return rc;
2118 }
2119
2120 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2121 {
2122         void *rc;
2123         struct tcp_iter_state* st = seq->private;
2124
2125         inet_listen_lock(&tcp_hashinfo);
2126         st->state = TCP_SEQ_STATE_LISTENING;
2127         rc        = listening_get_idx(seq, &pos);
2128
2129         if (!rc) {
2130                 inet_listen_unlock(&tcp_hashinfo);
2131                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2132                 rc        = established_get_idx(seq, pos);
2133         }
2134
2135         return rc;
2136 }
2137
2138 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2139 {
2140         struct tcp_iter_state* st = seq->private;
2141         st->state = TCP_SEQ_STATE_LISTENING;
2142         st->num = 0;
2143         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2144 }
2145
2146 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2147 {
2148         void *rc = NULL;
2149         struct tcp_iter_state* st;
2150
2151         if (v == SEQ_START_TOKEN) {
2152                 rc = tcp_get_idx(seq, 0);
2153                 goto out;
2154         }
2155         st = seq->private;
2156
2157         switch (st->state) {
2158         case TCP_SEQ_STATE_OPENREQ:
2159         case TCP_SEQ_STATE_LISTENING:
2160                 rc = listening_get_next(seq, v);
2161                 if (!rc) {
2162                         inet_listen_unlock(&tcp_hashinfo);
2163                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2164                         rc        = established_get_first(seq);
2165                 }
2166                 break;
2167         case TCP_SEQ_STATE_ESTABLISHED:
2168         case TCP_SEQ_STATE_TIME_WAIT:
2169                 rc = established_get_next(seq, v);
2170                 break;
2171         }
2172 out:
2173         ++*pos;
2174         return rc;
2175 }
2176
2177 static void tcp_seq_stop(struct seq_file *seq, void *v)
2178 {
2179         struct tcp_iter_state* st = seq->private;
2180
2181         switch (st->state) {
2182         case TCP_SEQ_STATE_OPENREQ:
2183                 if (v) {
2184                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2185                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2186                 }
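                /* Fall through. */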
2187         case TCP_SEQ_STATE_LISTENING:
2188                 if (v != SEQ_START_TOKEN)
2189                         inet_listen_unlock(&tcp_hashinfo);
2190                 break;
2191         case TCP_SEQ_STATE_TIME_WAIT:
2192         case TCP_SEQ_STATE_ESTABLISHED:
2193                 if (v)
2194                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2195                 break;
2196         }
2197 }
2198
2199 static int tcp_seq_open(struct inode *inode, struct file *file)
2200 {
2201         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2202         struct tcp_iter_state *s;
2203         int err;
2204
2205         err = seq_open_net(inode, file, &afinfo->seq_ops,
2206                           sizeof(struct tcp_iter_state));
2207         if (err < 0)
2208                 return err;
2209
2210         s = ((struct seq_file *)file->private_data)->private;
2211         s->family               = afinfo->family;
2212         return 0;
2213 }
2214
2215 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2216 {
2217         int rc = 0;
2218         struct proc_dir_entry *p;
2219
2220         afinfo->seq_fops.open           = tcp_seq_open;
2221         afinfo->seq_fops.read           = seq_read;
2222         afinfo->seq_fops.llseek         = seq_lseek;
2223         afinfo->seq_fops.release        = seq_release_net;
2224
2225         afinfo->seq_ops.start           = tcp_seq_start;
2226         afinfo->seq_ops.next            = tcp_seq_next;
2227         afinfo->seq_ops.stop            = tcp_seq_stop;
2228
2229         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2230                              &afinfo->seq_fops, afinfo);
2231         if (!p)
2232                 rc = -ENOMEM;
2233         return rc;
2234 }
2235
2236 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2237 {
2238         proc_net_remove(net, afinfo->name);
2239 }
2240
2241 static void get_openreq4(struct sock *sk, struct request_sock *req,
2242                          struct seq_file *f, int i, int uid, int *len)
2243 {
2244         const struct inet_request_sock *ireq = inet_rsk(req);
2245         int ttd = req->expires - jiffies;
2246
2247         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2248                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2249                 i,
2250                 ireq->loc_addr,
2251                 ntohs(inet_sk(sk)->sport),
2252                 ireq->rmt_addr,
2253                 ntohs(ireq->rmt_port),
2254                 TCP_SYN_RECV,
2255                 0, 0, /* could print option size, but that is af dependent. */
2256                 1,    /* timers active (only the expire timer) */
2257                 jiffies_to_clock_t(ttd),
2258                 req->retrans,
2259                 uid,
2260                 0,  /* non standard timer */
2261                 0, /* open_requests have no inode */
2262                 atomic_read(&sk->sk_refcnt),
2263                 req,
2264                 len);
2265 }
2266
2267 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2268 {
2269         int timer_active;
2270         unsigned long timer_expires;
2271         struct tcp_sock *tp = tcp_sk(sk);
2272         const struct inet_connection_sock *icsk = inet_csk(sk);
2273         struct inet_sock *inet = inet_sk(sk);
2274         __be32 dest = inet->daddr;
2275         __be32 src = inet->rcv_saddr;
2276         __u16 destp = ntohs(inet->dport);
2277         __u16 srcp = ntohs(inet->sport);
2278
2279         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2280                 timer_active    = 1;
2281                 timer_expires   = icsk->icsk_timeout;
2282         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2283                 timer_active    = 4;
2284                 timer_expires   = icsk->icsk_timeout;
2285         } else if (timer_pending(&sk->sk_timer)) {
2286                 timer_active    = 2;
2287                 timer_expires   = sk->sk_timer.expires;
2288         } else {
2289                 timer_active    = 0;
2290                 timer_expires = jiffies;
2291         }
2292
2293         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2294                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2295                 i, src, srcp, dest, destp, sk->sk_state,
2296                 tp->write_seq - tp->snd_una,
2297                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2298                                              (tp->rcv_nxt - tp->copied_seq),
2299                 timer_active,
2300                 jiffies_to_clock_t(timer_expires - jiffies),
2301                 icsk->icsk_retransmits,
2302                 sock_i_uid(sk),
2303                 icsk->icsk_probes_out,
2304                 sock_i_ino(sk),
2305                 atomic_read(&sk->sk_refcnt), sk,
2306                 jiffies_to_clock_t(icsk->icsk_rto),
2307                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2308                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2309                 tp->snd_cwnd,
2310                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2311                 len);
2312 }
2313
2314 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2315                                struct seq_file *f, int i, int *len)
2316 {
2317         __be32 dest, src;
2318         __u16 destp, srcp;
2319         int ttd = tw->tw_ttd - jiffies;
2320
2321         if (ttd < 0)
2322                 ttd = 0;
2323
2324         dest  = tw->tw_daddr;
2325         src   = tw->tw_rcv_saddr;
2326         destp = ntohs(tw->tw_dport);
2327         srcp  = ntohs(tw->tw_sport);
2328
2329         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2330                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2331                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2332                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2333                 atomic_read(&tw->tw_refcnt), tw, len);
2334 }
2335
2336 #define TMPSZ 150
2337
2338 static int tcp4_seq_show(struct seq_file *seq, void *v)
2339 {
2340         struct tcp_iter_state* st;
2341         int len;
2342
2343         if (v == SEQ_START_TOKEN) {
2344                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2345                            "  sl  local_address rem_address   st tx_queue "
2346                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2347                            "inode");
2348                 goto out;
2349         }
2350         st = seq->private;
2351
2352         switch (st->state) {
2353         case TCP_SEQ_STATE_LISTENING:
2354         case TCP_SEQ_STATE_ESTABLISHED:
2355                 get_tcp4_sock(v, seq, st->num, &len);
2356                 break;
2357         case TCP_SEQ_STATE_OPENREQ:
2358                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2359                 break;
2360         case TCP_SEQ_STATE_TIME_WAIT:
2361                 get_timewait4_sock(v, seq, st->num, &len);
2362                 break;
2363         }
2364         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2365 out:
2366         return 0;
2367 }
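/*
 * Illustrative userspace sketch (not part of the kernel build): a minimal
 * reader for the /proc/net/tcp rows emitted by tcp4_seq_show() above.  Field
 * positions follow the seq_printf() format strings in get_tcp4_sock(); the
 * program below is an example only.
 */
#if 0
#include <stdio.h>

int main(void)
{
        unsigned int laddr, lport, raddr, rport, state;
        char line[256];
        FILE *fp = fopen("/proc/net/tcp", "r");

        if (!fp)
                return 1;
        if (!fgets(line, sizeof(line), fp)) {   /* skip the header row */
                fclose(fp);
                return 1;
        }
        while (fgets(line, sizeof(line), fp)) {
                if (sscanf(line, " %*d: %X:%X %X:%X %X",
                           &laddr, &lport, &raddr, &rport, &state) == 5)
                        printf("local %08X:%u remote %08X:%u state %02X\n",
                               laddr, lport, raddr, rport, state);
        }
        fclose(fp);
        return 0;
}
#endif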
2368
2369 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2370         .name           = "tcp",
2371         .family         = AF_INET,
2372         .seq_fops       = {
2373                 .owner          = THIS_MODULE,
2374         },
2375         .seq_ops        = {
2376                 .show           = tcp4_seq_show,
2377         },
2378 };
2379
2380 static int tcp4_proc_init_net(struct net *net)
2381 {
2382         return tcp_proc_register(net, &tcp4_seq_afinfo);
2383 }
2384
2385 static void tcp4_proc_exit_net(struct net *net)
2386 {
2387         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2388 }
2389
2390 static struct pernet_operations tcp4_net_ops = {
2391         .init = tcp4_proc_init_net,
2392         .exit = tcp4_proc_exit_net,
2393 };
2394
2395 int __init tcp4_proc_init(void)
2396 {
2397         return register_pernet_subsys(&tcp4_net_ops);
2398 }
2399
2400 void tcp4_proc_exit(void)
2401 {
2402         unregister_pernet_subsys(&tcp4_net_ops);
2403 }
2404 #endif /* CONFIG_PROC_FS */
2405
2406 struct proto tcp_prot = {
2407         .name                   = "TCP",
2408         .owner                  = THIS_MODULE,
2409         .close                  = tcp_close,
2410         .connect                = tcp_v4_connect,
2411         .disconnect             = tcp_disconnect,
2412         .accept                 = inet_csk_accept,
2413         .ioctl                  = tcp_ioctl,
2414         .init                   = tcp_v4_init_sock,
2415         .destroy                = tcp_v4_destroy_sock,
2416         .shutdown               = tcp_shutdown,
2417         .setsockopt             = tcp_setsockopt,
2418         .getsockopt             = tcp_getsockopt,
2419         .recvmsg                = tcp_recvmsg,
2420         .backlog_rcv            = tcp_v4_do_rcv,
2421         .hash                   = inet_hash,
2422         .unhash                 = inet_unhash,
2423         .get_port               = inet_csk_get_port,
2424         .enter_memory_pressure  = tcp_enter_memory_pressure,
2425         .sockets_allocated      = &tcp_sockets_allocated,
2426         .orphan_count           = &tcp_orphan_count,
2427         .memory_allocated       = &tcp_memory_allocated,
2428         .memory_pressure        = &tcp_memory_pressure,
2429         .sysctl_mem             = sysctl_tcp_mem,
2430         .sysctl_wmem            = sysctl_tcp_wmem,
2431         .sysctl_rmem            = sysctl_tcp_rmem,
2432         .max_header             = MAX_TCP_HEADER,
2433         .obj_size               = sizeof(struct tcp_sock),
2434         .twsk_prot              = &tcp_timewait_sock_ops,
2435         .rsk_prot               = &tcp_request_sock_ops,
2436         .h.hashinfo             = &tcp_hashinfo,
2437 #ifdef CONFIG_COMPAT
2438         .compat_setsockopt      = compat_tcp_setsockopt,
2439         .compat_getsockopt      = compat_tcp_getsockopt,
2440 #endif
2441 };
2442
2443
2444 static int __net_init tcp_sk_init(struct net *net)
2445 {
2446         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2447                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2448 }
2449
2450 static void __net_exit tcp_sk_exit(struct net *net)
2451 {
2452         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2453 }
2454
2455 static struct pernet_operations __net_initdata tcp_sk_ops = {
2456        .init = tcp_sk_init,
2457        .exit = tcp_sk_exit,
2458 };
2459
2460 void __init tcp_v4_init(void)
2461 {
2462         if (register_pernet_device(&tcp_sk_ops))
2463                 panic("Failed to create the TCP control socket.\n");
2464 }
2465
2466 EXPORT_SYMBOL(ipv4_specific);
2467 EXPORT_SYMBOL(tcp_hashinfo);
2468 EXPORT_SYMBOL(tcp_prot);
2469 EXPORT_SYMBOL(tcp_v4_conn_request);
2470 EXPORT_SYMBOL(tcp_v4_connect);
2471 EXPORT_SYMBOL(tcp_v4_do_rcv);
2472 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2473 EXPORT_SYMBOL(tcp_v4_send_check);
2474 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2475
2476 #ifdef CONFIG_PROC_FS
2477 EXPORT_SYMBOL(tcp_proc_register);
2478 EXPORT_SYMBOL(tcp_proc_unregister);
2479 #endif
2480 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2481