linux-2.6 / net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87 /* Check TCP sequence numbers in ICMP packets. */
88 #define ICMP_MIN_LENGTH 8
89
90 /* Socket used for sending RSTs */
91 static struct socket *tcp_socket;
92
93 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97                                                    __be32 addr);
98 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99                                    __be32 saddr, __be32 daddr,
100                                    struct tcphdr *th, int protocol,
101                                    int tcplen);
102 #endif
103
104 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106         .lhash_users = ATOMIC_INIT(0),
107         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108 };
109
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(skb->nh.iph->daddr,
129                                           skb->nh.iph->saddr,
130                                           skb->h.th->dest,
131                                           skb->h.th->source);
132 }
133
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided sequence
141            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142
143            Actually, the idea is close to VJ's one, only timestamp cache is
144            held not per host, but per port pair and TW bucket is used as state
145            holder.
146
147            If TW bucket has been already destroyed we fall back to VJ's scheme
148            and use initial timestamp retrieved from peer table.
149          */
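            /* Reuse is allowed when the caller supplied no twp, or when
             * sysctl_tcp_tw_reuse is enabled and more than one second has
             * passed since the last timestamp seen on the old connection.
             * The new write_seq starts 65535 + 2 beyond the old snd_nxt so
             * the two sequence spaces cannot overlap.
             */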
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167 /* This will initiate an outgoing connection. */
168 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169 {
170         struct inet_sock *inet = inet_sk(sk);
171         struct tcp_sock *tp = tcp_sk(sk);
172         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173         struct rtable *rt;
174         __be32 daddr, nexthop;
175         int tmp;
176         int err;
177
178         if (addr_len < sizeof(struct sockaddr_in))
179                 return -EINVAL;
180
181         if (usin->sin_family != AF_INET)
182                 return -EAFNOSUPPORT;
183
184         nexthop = daddr = usin->sin_addr.s_addr;
185         if (inet->opt && inet->opt->srr) {
186                 if (!daddr)
187                         return -EINVAL;
188                 nexthop = inet->opt->faddr;
189         }
190
191         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193                                IPPROTO_TCP,
194                                inet->sport, usin->sin_port, sk);
195         if (tmp < 0)
196                 return tmp;
197
198         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
199                 ip_rt_put(rt);
200                 return -ENETUNREACH;
201         }
202
203         if (!inet->opt || !inet->opt->srr)
204                 daddr = rt->rt_dst;
205
206         if (!inet->saddr)
207                 inet->saddr = rt->rt_src;
208         inet->rcv_saddr = inet->saddr;
209
210         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
211                 /* Reset inherited state */
212                 tp->rx_opt.ts_recent       = 0;
213                 tp->rx_opt.ts_recent_stamp = 0;
214                 tp->write_seq              = 0;
215         }
216
217         if (tcp_death_row.sysctl_tw_recycle &&
218             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
219                 struct inet_peer *peer = rt_get_peer(rt);
220                 /*
221                  * VJ's idea. We save last timestamp seen from
222                  * the destination in peer table, when entering state
223                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
224                  * when trying a new connection.
225                  */
226                 if (peer != NULL &&
227                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
228                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
229                         tp->rx_opt.ts_recent = peer->tcp_ts;
230                 }
231         }
232
233         inet->dport = usin->sin_port;
234         inet->daddr = daddr;
235
236         inet_csk(sk)->icsk_ext_hdr_len = 0;
237         if (inet->opt)
238                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
239
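            /* 536 is the RFC 1122 default MSS: the 576 byte minimum reassembly
             * buffer minus 40 bytes of IP and TCP headers.
             */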
240         tp->rx_opt.mss_clamp = 536;
241
242         /* Socket identity is still unknown (sport may be zero).
243          * However we set state to SYN-SENT and, without releasing the socket
244          * lock, select a source port, enter ourselves into the hash tables and
245          * complete initialization after this.
246          */
247         tcp_set_state(sk, TCP_SYN_SENT);
248         err = inet_hash_connect(&tcp_death_row, sk);
249         if (err)
250                 goto failure;
251
252         err = ip_route_newports(&rt, IPPROTO_TCP,
253                                 inet->sport, inet->dport, sk);
254         if (err)
255                 goto failure;
256
257         /* OK, now commit destination to socket.  */
258         sk->sk_gso_type = SKB_GSO_TCPV4;
259         sk_setup_caps(sk, &rt->u.dst);
260
261         if (!tp->write_seq)
262                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
263                                                            inet->daddr,
264                                                            inet->sport,
265                                                            usin->sin_port);
266
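            /* Seed the IP identification counter from the initial sequence
             * number mixed with the current jiffies value.
             */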
267         inet->id = tp->write_seq ^ jiffies;
268
269         err = tcp_connect(sk);
270         rt = NULL;
271         if (err)
272                 goto failure;
273
274         return 0;
275
276 failure:
277         /*
278          * This unhashes the socket and releases the local port,
279          * if necessary.
280          */
281         tcp_set_state(sk, TCP_CLOSE);
282         ip_rt_put(rt);
283         sk->sk_route_caps = 0;
284         inet->dport = 0;
285         return err;
286 }
287
288 /*
289  * This routine does path mtu discovery as defined in RFC1191.
290  */
291 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
292 {
293         struct dst_entry *dst;
294         struct inet_sock *inet = inet_sk(sk);
295
296         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
297          * sent out by Linux are always < 576 bytes, so they should go through
298          * unfragmented).
299          */
300         if (sk->sk_state == TCP_LISTEN)
301                 return;
302
303         /* We don't check in the dst entry if pmtu discovery is forbidden
304          * on this route. We just assume that no packet-too-big packets
305          * are sent back when pmtu discovery is not active.
306          * There is a small race when the user changes this flag in the
307          * route, but I think that's acceptable.
308          */
309         if ((dst = __sk_dst_check(sk, 0)) == NULL)
310                 return;
311
312         dst->ops->update_pmtu(dst, mtu);
313
314         /* Something is about to go wrong... Remember the soft error
315          * in case this connection is not able to recover.
316          */
317         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
318                 sk->sk_err_soft = EMSGSIZE;
319
320         mtu = dst_mtu(dst);
321
322         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
323             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
324                 tcp_sync_mss(sk, mtu);
325
326                 /* Resend the TCP packet because it's
327                  * clear that the old packet has been
328                  * dropped. This is the new "fast" path mtu
329                  * discovery.
330                  */
331                 tcp_simple_retransmit(sk);
332         } /* else let the usual retransmit timer handle it */
333 }
334
335 /*
336  * This routine is called by the ICMP module when it gets some
337  * sort of error condition.  If err < 0 then the socket should
338  * be closed and the error returned to the user.  If err > 0
339  * it's just the icmp type << 8 | icmp code.  After adjustment
340  * header points to the first 8 bytes of the tcp header.  We need
341  * to find the appropriate port.
342  *
343  * The locking strategy used here is very "optimistic". When
344  * someone else accesses the socket the ICMP is just dropped
345  * and for some paths there is no check at all.
346  * A more general error queue to queue errors for later handling
347  * is probably better.
348  *
349  */
350
351 void tcp_v4_err(struct sk_buff *skb, u32 info)
352 {
353         struct iphdr *iph = (struct iphdr *)skb->data;
354         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
355         struct tcp_sock *tp;
356         struct inet_sock *inet;
357         int type = skb->h.icmph->type;
358         int code = skb->h.icmph->code;
359         struct sock *sk;
360         __u32 seq;
361         int err;
362
363         if (skb->len < (iph->ihl << 2) + 8) {
364                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
365                 return;
366         }
367
368         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
369                          th->source, inet_iif(skb));
370         if (!sk) {
371                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
372                 return;
373         }
374         if (sk->sk_state == TCP_TIME_WAIT) {
375                 inet_twsk_put(inet_twsk(sk));
376                 return;
377         }
378
379         bh_lock_sock(sk);
380         /* If too many ICMPs get dropped on busy
381          * servers this needs to be solved differently.
382          */
383         if (sock_owned_by_user(sk))
384                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
385
386         if (sk->sk_state == TCP_CLOSE)
387                 goto out;
388
389         tp = tcp_sk(sk);
390         seq = ntohl(th->seq);
391         if (sk->sk_state != TCP_LISTEN &&
392             !between(seq, tp->snd_una, tp->snd_nxt)) {
393                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
394                 goto out;
395         }
396
397         switch (type) {
398         case ICMP_SOURCE_QUENCH:
399                 /* Just silently ignore these. */
400                 goto out;
401         case ICMP_PARAMETERPROB:
402                 err = EPROTO;
403                 break;
404         case ICMP_DEST_UNREACH:
405                 if (code > NR_ICMP_UNREACH)
406                         goto out;
407
408                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409                         if (!sock_owned_by_user(sk))
410                                 do_pmtu_discovery(sk, iph, info);
411                         goto out;
412                 }
413
414                 err = icmp_err_convert[code].errno;
415                 break;
416         case ICMP_TIME_EXCEEDED:
417                 err = EHOSTUNREACH;
418                 break;
419         default:
420                 goto out;
421         }
422
423         switch (sk->sk_state) {
424                 struct request_sock *req, **prev;
425         case TCP_LISTEN:
426                 if (sock_owned_by_user(sk))
427                         goto out;
428
429                 req = inet_csk_search_req(sk, &prev, th->dest,
430                                           iph->daddr, iph->saddr);
431                 if (!req)
432                         goto out;
433
434                 /* ICMPs are not backlogged, hence we cannot get
435                    an established socket here.
436                  */
437                 BUG_TRAP(!req->sk);
438
439                 if (seq != tcp_rsk(req)->snt_isn) {
440                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
441                         goto out;
442                 }
443
444                 /*
445                  * Still in SYN_RECV, just remove it silently.
446                  * There is no good way to pass the error to the newly
447                  * created socket, and POSIX does not want network
448                  * errors returned from accept().
449                  */
450                 inet_csk_reqsk_queue_drop(sk, req, prev);
451                 goto out;
452
453         case TCP_SYN_SENT:
454         case TCP_SYN_RECV:  /* Cannot happen normally.
455                                It can, e.g., if SYNs crossed.
456                              */
457                 if (!sock_owned_by_user(sk)) {
458                         sk->sk_err = err;
459
460                         sk->sk_error_report(sk);
461
462                         tcp_done(sk);
463                 } else {
464                         sk->sk_err_soft = err;
465                 }
466                 goto out;
467         }
468
469         /* If we've already connected we will keep trying
470          * until we time out, or the user gives up.
471          *
472          * rfc1122 4.2.3.9 allows considering as hard errors
473          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
474          * but it is obsoleted by pmtu discovery).
475          *
476          * Note that in the modern internet, where routing is unreliable
477          * and broken firewalls sit in every dark corner, sending random
478          * errors ordered by their masters, even these two messages finally
479          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
480          *
481          * Now we are in compliance with RFCs.
482          *                                                      --ANK (980905)
483          */
484
485         inet = inet_sk(sk);
486         if (!sock_owned_by_user(sk) && inet->recverr) {
487                 sk->sk_err = err;
488                 sk->sk_error_report(sk);
489         } else  { /* Only an error on timeout */
490                 sk->sk_err_soft = err;
491         }
492
493 out:
494         bh_unlock_sock(sk);
495         sock_put(sk);
496 }
497
498 /* This routine computes an IPv4 TCP checksum. */
499 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
500 {
501         struct inet_sock *inet = inet_sk(sk);
502         struct tcphdr *th = skb->h.th;
503
504         if (skb->ip_summed == CHECKSUM_PARTIAL) {
505                 th->check = ~tcp_v4_check(th, len,
506                                           inet->saddr, inet->daddr, 0);
507                 skb->csum_offset = offsetof(struct tcphdr, check);
508         } else {
509                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
510                                          csum_partial((char *)th,
511                                                       th->doff << 2,
512                                                       skb->csum));
513         }
514 }
515
516 int tcp_v4_gso_send_check(struct sk_buff *skb)
517 {
518         struct iphdr *iph;
519         struct tcphdr *th;
520
521         if (!pskb_may_pull(skb, sizeof(*th)))
522                 return -EINVAL;
523
524         iph = skb->nh.iph;
525         th = skb->h.th;
526
527         th->check = 0;
528         th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
529         skb->csum_offset = offsetof(struct tcphdr, check);
530         skb->ip_summed = CHECKSUM_PARTIAL;
531         return 0;
532 }
533
534 /*
535  *      This routine will send an RST to the other tcp.
536  *
537  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
538  *                    for reset.
539  *      Answer: if a packet caused RST, it is not for a socket
540  *              existing in our system; if it is matched to a socket,
541  *              it is just a duplicate segment or a bug in the other side's TCP.
542  *              So we build the reply based only on the parameters
543  *              that arrived with the segment.
544  *      Exception: precedence violation. We do not implement it in any case.
545  */
546
547 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
548 {
549         struct tcphdr *th = skb->h.th;
550         struct {
551                 struct tcphdr th;
552 #ifdef CONFIG_TCP_MD5SIG
553                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
554 #endif
555         } rep;
556         struct ip_reply_arg arg;
557 #ifdef CONFIG_TCP_MD5SIG
558         struct tcp_md5sig_key *key;
559 #endif
560
561         /* Never send a reset in response to a reset. */
562         if (th->rst)
563                 return;
564
565         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
566                 return;
567
568         /* Swap the send and the receive. */
569         memset(&rep, 0, sizeof(rep));
570         rep.th.dest   = th->source;
571         rep.th.source = th->dest;
572         rep.th.doff   = sizeof(struct tcphdr) / 4;
573         rep.th.rst    = 1;
574
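            /* RFC 793: if the offending segment carried an ACK, the RST takes
             * its sequence number from that ack_seq; otherwise the RST keeps
             * seq 0 and instead ACKs everything the segment occupied.
             */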
575         if (th->ack) {
576                 rep.th.seq = th->ack_seq;
577         } else {
578                 rep.th.ack = 1;
579                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
580                                        skb->len - (th->doff << 2));
581         }
582
583         memset(&arg, 0, sizeof(arg));
584         arg.iov[0].iov_base = (unsigned char *)&rep;
585         arg.iov[0].iov_len  = sizeof(rep.th);
586
587 #ifdef CONFIG_TCP_MD5SIG
588         key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
589         if (key) {
590                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
591                                    (TCPOPT_NOP << 16) |
592                                    (TCPOPT_MD5SIG << 8) |
593                                    TCPOLEN_MD5SIG);
594                 /* Update length and the length the header thinks exists */
595                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
596                 rep.th.doff = arg.iov[0].iov_len / 4;
597
598                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
599                                         key,
600                                         skb->nh.iph->daddr,
601                                         skb->nh.iph->saddr,
602                                         &rep.th, IPPROTO_TCP,
603                                         arg.iov[0].iov_len);
604         }
605 #endif
606         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
607                                       skb->nh.iph->saddr, /* XXX */
608                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
609         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
610
611         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
612
613         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
614         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
615 }
616
617 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
618    outside socket context, is certainly ugly. What can I do?
619  */
620
621 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
622                             struct sk_buff *skb, u32 seq, u32 ack,
623                             u32 win, u32 ts)
624 {
625         struct tcphdr *th = skb->h.th;
626         struct {
627                 struct tcphdr th;
628                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
629 #ifdef CONFIG_TCP_MD5SIG
630                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
631 #endif
632                         ];
633         } rep;
634         struct ip_reply_arg arg;
635 #ifdef CONFIG_TCP_MD5SIG
636         struct tcp_md5sig_key *key;
637         struct tcp_md5sig_key tw_key;
638 #endif
639
640         memset(&rep.th, 0, sizeof(struct tcphdr));
641         memset(&arg, 0, sizeof(arg));
642
643         arg.iov[0].iov_base = (unsigned char *)&rep;
644         arg.iov[0].iov_len  = sizeof(rep.th);
645         if (ts) {
646                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
647                                    (TCPOPT_TIMESTAMP << 8) |
648                                    TCPOLEN_TIMESTAMP);
649                 rep.opt[1] = htonl(tcp_time_stamp);
650                 rep.opt[2] = htonl(ts);
651                 arg.iov[0].iov_len = TCPOLEN_TSTAMP_ALIGNED;
652         }
653
654         /* Swap the send and the receive. */
655         rep.th.dest    = th->source;
656         rep.th.source  = th->dest;
657         rep.th.doff    = arg.iov[0].iov_len / 4;
658         rep.th.seq     = htonl(seq);
659         rep.th.ack_seq = htonl(ack);
660         rep.th.ack     = 1;
661         rep.th.window  = htons(win);
662
663 #ifdef CONFIG_TCP_MD5SIG
664         /*
665          * The SKB holds an incoming packet, but may not have a valid ->sk
666          * pointer. This is especially the case when we're dealing with a
667          * TIME_WAIT ack, because the sk structure is long gone, and only
668          * the tcp_timewait_sock remains. So the md5 key is stashed in that
669          * structure, and we use it in preference.  I believe that (twsk ||
670          * skb->sk) holds true, but we program defensively.
671          */
672         if (!twsk && skb->sk) {
673                 key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
674         } else if (twsk && twsk->tw_md5_keylen) {
675                 tw_key.key = twsk->tw_md5_key;
676                 tw_key.keylen = twsk->tw_md5_keylen;
677                 key = &tw_key;
678         } else
679                 key = NULL;
680
681         if (key) {
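                    /* A timestamp option, if written above, already occupies
                     * opt[0..2], so the MD5 option block starts at opt[3].
                     */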
682                 int offset = (ts) ? 3 : 0;
683
684                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
685                                           (TCPOPT_NOP << 16) |
686                                           (TCPOPT_MD5SIG << 8) |
687                                           TCPOLEN_MD5SIG);
688                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689                 rep.th.doff = arg.iov[0].iov_len/4;
690
691                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
692                                         key,
693                                         skb->nh.iph->daddr,
694                                         skb->nh.iph->saddr,
695                                         &rep.th, IPPROTO_TCP,
696                                         arg.iov[0].iov_len);
697         }
698 #endif
699         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
700                                       skb->nh.iph->saddr, /* XXX */
701                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
702         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703
704         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
705
706         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
707 }
708
709 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
710 {
711         struct inet_timewait_sock *tw = inet_twsk(sk);
712         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
713
714         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
715                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
716                         tcptw->tw_ts_recent);
717
718         inet_twsk_put(tw);
719 }
720
721 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
722                                   struct request_sock *req)
723 {
724         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
725                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
726                         req->ts_recent);
727 }
728
729 /*
730  *      Send a SYN-ACK after having received a SYN.
731  *      This still operates on a request_sock only, not on a big
732  *      socket.
733  */
734 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
735                               struct dst_entry *dst)
736 {
737         const struct inet_request_sock *ireq = inet_rsk(req);
738         int err = -1;
739         struct sk_buff * skb;
740
741         /* First, grab a route. */
742         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
743                 goto out;
744
745         skb = tcp_make_synack(sk, dst, req);
746
747         if (skb) {
748                 struct tcphdr *th = skb->h.th;
749
750                 th->check = tcp_v4_check(th, skb->len,
751                                          ireq->loc_addr,
752                                          ireq->rmt_addr,
753                                          csum_partial((char *)th, skb->len,
754                                                       skb->csum));
755
756                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
757                                             ireq->rmt_addr,
758                                             ireq->opt);
759                 err = net_xmit_eval(err);
760         }
761
762 out:
763         dst_release(dst);
764         return err;
765 }
766
767 /*
768  *      IPv4 request_sock destructor.
769  */
770 static void tcp_v4_reqsk_destructor(struct request_sock *req)
771 {
772         kfree(inet_rsk(req)->opt);
773 }
774
775 #ifdef CONFIG_SYN_COOKIES
776 static void syn_flood_warning(struct sk_buff *skb)
777 {
778         static unsigned long warntime;
779
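            /* Rate-limit the warning to at most once per minute. */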
780         if (time_after(jiffies, (warntime + HZ * 60))) {
781                 warntime = jiffies;
782                 printk(KERN_INFO
783                        "possible SYN flooding on port %d. Sending cookies.\n",
784                        ntohs(skb->h.th->dest));
785         }
786 }
787 #endif
788
789 /*
790  * Save and compile IPv4 options into the request_sock if needed.
791  */
792 static struct ip_options *tcp_v4_save_options(struct sock *sk,
793                                               struct sk_buff *skb)
794 {
795         struct ip_options *opt = &(IPCB(skb)->opt);
796         struct ip_options *dopt = NULL;
797
798         if (opt && opt->optlen) {
799                 int opt_size = optlength(opt);
800                 dopt = kmalloc(opt_size, GFP_ATOMIC);
801                 if (dopt) {
802                         if (ip_options_echo(dopt, skb)) {
803                                 kfree(dopt);
804                                 dopt = NULL;
805                         }
806                 }
807         }
808         return dopt;
809 }
810
811 #ifdef CONFIG_TCP_MD5SIG
812 /*
813  * RFC2385 MD5 checksumming requires a mapping of
814  * IP address->MD5 Key.
815  * We need to maintain these in the sk structure.
816  */
817
818 /* Find the Key structure for an address.  */
819 static struct tcp_md5sig_key *
820                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
821 {
822         struct tcp_sock *tp = tcp_sk(sk);
823         int i;
824
825         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
826                 return NULL;
827         for (i = 0; i < tp->md5sig_info->entries4; i++) {
828                 if (tp->md5sig_info->keys4[i].addr == addr)
829                         return (struct tcp_md5sig_key *)
830                                                 &tp->md5sig_info->keys4[i];
831         }
832         return NULL;
833 }
834
835 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
836                                          struct sock *addr_sk)
837 {
838         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
839 }
840
841 EXPORT_SYMBOL(tcp_v4_md5_lookup);
842
843 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
844                                                       struct request_sock *req)
845 {
846         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
847 }
848
849 /* This can be called on a newly created socket, from other files */
850 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
851                       u8 *newkey, u8 newkeylen)
852 {
853         /* Add Key to the list */
854         struct tcp4_md5sig_key *key;
855         struct tcp_sock *tp = tcp_sk(sk);
856         struct tcp4_md5sig_key *keys;
857
858         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
859         if (key) {
860                 /* Pre-existing entry - just update that one. */
861                 kfree(key->key);
862                 key->key = newkey;
863                 key->keylen = newkeylen;
864         } else {
865                 struct tcp_md5sig_info *md5sig;
866
867                 if (!tp->md5sig_info) {
868                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
869                                                   GFP_ATOMIC);
870                         if (!tp->md5sig_info) {
871                                 kfree(newkey);
872                                 return -ENOMEM;
873                         }
874                 }
875                 if (tcp_alloc_md5sig_pool() == NULL) {
876                         kfree(newkey);
877                         return -ENOMEM;
878                 }
879                 md5sig = tp->md5sig_info;
880
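                    /* No spare slot: grow the key array by one entry, copying
                     * the existing keys across and freeing the old array.
                     */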
881                 if (md5sig->alloced4 == md5sig->entries4) {
882                         keys = kmalloc((sizeof(*keys) *
883                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
884                         if (!keys) {
885                                 kfree(newkey);
886                                 tcp_free_md5sig_pool();
887                                 return -ENOMEM;
888                         }
889
890                         if (md5sig->entries4)
891                                 memcpy(keys, md5sig->keys4,
892                                        sizeof(*keys) * md5sig->entries4);
893
894                         /* Free old key list, and reference new one */
895                         if (md5sig->keys4)
896                                 kfree(md5sig->keys4);
897                         md5sig->keys4 = keys;
898                         md5sig->alloced4++;
899                 }
900                 md5sig->entries4++;
901                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
902                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
903                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
904         }
905         return 0;
906 }
907
908 EXPORT_SYMBOL(tcp_v4_md5_do_add);
909
910 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
911                                u8 *newkey, u8 newkeylen)
912 {
913         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
914                                  newkey, newkeylen);
915 }
916
917 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
918 {
919         struct tcp_sock *tp = tcp_sk(sk);
920         int i;
921
922         for (i = 0; i < tp->md5sig_info->entries4; i++) {
923                 if (tp->md5sig_info->keys4[i].addr == addr) {
924                         /* Free the key */
925                         kfree(tp->md5sig_info->keys4[i].key);
926                         tp->md5sig_info->entries4--;
927
928                         if (tp->md5sig_info->entries4 == 0) {
929                                 kfree(tp->md5sig_info->keys4);
930                                 tp->md5sig_info->keys4 = NULL;
931                         } else if (tp->md5sig_info->entries4 != i) {
932                                 /* Need to do some manipulation */
933                                 memcpy(&tp->md5sig_info->keys4[i],
934                                        &tp->md5sig_info->keys4[i+1],
935                                        (tp->md5sig_info->entries4 - i) *
936                                         sizeof(struct tcp4_md5sig_key));
937                         }
938                         tcp_free_md5sig_pool();
939                         return 0;
940                 }
941         }
942         return -ENOENT;
943 }
944
945 EXPORT_SYMBOL(tcp_v4_md5_do_del);
946
947 static void tcp_v4_clear_md5_list(struct sock *sk)
948 {
949         struct tcp_sock *tp = tcp_sk(sk);
950
951         /* Free each key, then the set of keys,
952          * the crypto element, and then decrement our
953          * hold on the last resort crypto.
954          */
955         if (tp->md5sig_info->entries4) {
956                 int i;
957                 for (i = 0; i < tp->md5sig_info->entries4; i++)
958                         kfree(tp->md5sig_info->keys4[i].key);
959                 tp->md5sig_info->entries4 = 0;
960                 tcp_free_md5sig_pool();
961         }
962         if (tp->md5sig_info->keys4) {
963                 kfree(tp->md5sig_info->keys4);
964                 tp->md5sig_info->keys4 = NULL;
965                 tp->md5sig_info->alloced4  = 0;
966         }
967 }
968
969 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
970                                  int optlen)
971 {
972         struct tcp_md5sig cmd;
973         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
974         u8 *newkey;
975
976         if (optlen < sizeof(cmd))
977                 return -EINVAL;
978
979         if (copy_from_user(&cmd, optval, sizeof(cmd)))
980                 return -EFAULT;
981
982         if (sin->sin_family != AF_INET)
983                 return -EINVAL;
984
985         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
986                 if (!tcp_sk(sk)->md5sig_info)
987                         return -ENOENT;
988                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
989         }
990
991         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
992                 return -EINVAL;
993
994         if (!tcp_sk(sk)->md5sig_info) {
995                 struct tcp_sock *tp = tcp_sk(sk);
996                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
997
998                 if (!p)
999                         return -ENOMEM;
1000
1001                 tp->md5sig_info = p;
1002
1003         }
1004
1005         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1006         if (!newkey)
1007                 return -ENOMEM;
1008         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1009                                  newkey, cmd.tcpm_keylen);
1010 }
1011
1012 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1013                                    __be32 saddr, __be32 daddr,
1014                                    struct tcphdr *th, int protocol,
1015                                    int tcplen)
1016 {
1017         struct scatterlist sg[4];
1018         __u16 data_len;
1019         int block = 0;
1020         __sum16 old_checksum;
1021         struct tcp_md5sig_pool *hp;
1022         struct tcp4_pseudohdr *bp;
1023         struct hash_desc *desc;
1024         int err;
1025         unsigned int nbytes = 0;
1026
1027         /*
1028          * Okay, so RFC2385 is turned on for this connection,
1029          * so we need to generate the MD5 hash for the packet now.
1030          */
1031
1032         hp = tcp_get_md5sig_pool();
1033         if (!hp)
1034                 goto clear_hash_noput;
1035
1036         bp = &hp->md5_blk.ip4;
1037         desc = &hp->md5_desc;
1038
1039         /*
1040          * 1. the TCP pseudo-header (in the order: source IP address,
1041          * destination IP address, zero-padded protocol number, and
1042          * segment length)
1043          */
1044         bp->saddr = saddr;
1045         bp->daddr = daddr;
1046         bp->pad = 0;
1047         bp->protocol = protocol;
1048         bp->len = htons(tcplen);
1049         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1050         nbytes += sizeof(*bp);
1051
1052         /* 2. the TCP header, excluding options, and assuming a
1053          * checksum of zero.
1054          */
1055         old_checksum = th->check;
1056         th->check = 0;
1057         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1058         nbytes += sizeof(struct tcphdr);
1059
1060         /* 3. the TCP segment data (if any) */
1061         data_len = tcplen - (th->doff << 2);
1062         if (data_len > 0) {
1063                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1064                 sg_set_buf(&sg[block++], data, data_len);
1065                 nbytes += data_len;
1066         }
1067
1068         /* 4. an independently-specified key or password, known to both
1069          * TCPs and presumably connection-specific
1070          */
1071         sg_set_buf(&sg[block++], key->key, key->keylen);
1072         nbytes += key->keylen;
1073
1074         /* Now store the Hash into the packet */
1075         err = crypto_hash_init(desc);
1076         if (err)
1077                 goto clear_hash;
1078         err = crypto_hash_update(desc, sg, nbytes);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_final(desc, md5_hash);
1082         if (err)
1083                 goto clear_hash;
1084
1085         /* Reset header, and free up the crypto */
1086         tcp_put_md5sig_pool();
1087         th->check = old_checksum;
1088
1089 out:
1090         return 0;
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         goto out;
1096 }
1097
1098 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099                          struct sock *sk,
1100                          struct dst_entry *dst,
1101                          struct request_sock *req,
1102                          struct tcphdr *th, int protocol,
1103                          int tcplen)
1104 {
1105         __be32 saddr, daddr;
1106
1107         if (sk) {
1108                 saddr = inet_sk(sk)->saddr;
1109                 daddr = inet_sk(sk)->daddr;
1110         } else {
1111                 struct rtable *rt = (struct rtable *)dst;
1112                 BUG_ON(!rt);
1113                 saddr = rt->rt_src;
1114                 daddr = rt->rt_dst;
1115         }
1116         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117                                        saddr, daddr,
1118                                        th, protocol, tcplen);
1119 }
1120
1121 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
1123 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1124 {
1125         /*
1126          * This gets called for each TCP segment that arrives
1127          * so we want to be efficient.
1128          * We have 3 drop cases:
1129          * o No MD5 hash and one expected.
1130          * o MD5 hash and we're not expecting one.
1131          * o MD5 hash and it's wrong.
1132          */
1133         __u8 *hash_location = NULL;
1134         struct tcp_md5sig_key *hash_expected;
1135         struct iphdr *iph = skb->nh.iph;
1136         struct tcphdr *th = skb->h.th;
1137         int length = (th->doff << 2) - sizeof(struct tcphdr);
1138         int genhash;
1139         unsigned char *ptr;
1140         unsigned char newhash[16];
1141
1142         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1143
1144         /*
1145          * If the TCP option length is less than the TCP_MD5SIG
1146          * option length, then we can shortcut
1147          */
1148         if (length < TCPOLEN_MD5SIG) {
1149                 if (hash_expected)
1150                         return 1;
1151                 else
1152                         return 0;
1153         }
1154
1155         /* Okay, we can't shortcut - we have to grub through the options */
1156         ptr = (unsigned char *)(th + 1);
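            /* TCP options are encoded as <kind> [<length> <data...>]; EOL and
             * NOP are single-byte options, every other kind carries a length
             * byte that covers the kind and length octets themselves.
             */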
1157         while (length > 0) {
1158                 int opcode = *ptr++;
1159                 int opsize;
1160
1161                 switch (opcode) {
1162                 case TCPOPT_EOL:
1163                         goto done_opts;
1164                 case TCPOPT_NOP:
1165                         length--;
1166                         continue;
1167                 default:
1168                         opsize = *ptr++;
1169                         if (opsize < 2)
1170                                 goto done_opts;
1171                         if (opsize > length)
1172                                 goto done_opts;
1173
1174                         if (opcode == TCPOPT_MD5SIG) {
1175                                 hash_location = ptr;
1176                                 goto done_opts;
1177                         }
1178                 }
1179                 ptr += opsize-2;
1180                 length -= opsize;
1181         }
1182 done_opts:
1183         /* We've parsed the options - do we have a hash? */
1184         if (!hash_expected && !hash_location)
1185                 return 0;
1186
1187         if (hash_expected && !hash_location) {
1188                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1189                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1190                                NIPQUAD(iph->saddr), ntohs(th->source),
1191                                NIPQUAD(iph->daddr), ntohs(th->dest));
1192                 return 1;
1193         }
1194
1195         if (!hash_expected && hash_location) {
1196                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1197                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1198                                NIPQUAD(iph->saddr), ntohs(th->source),
1199                                NIPQUAD(iph->daddr), ntohs(th->dest));
1200                 return 1;
1201         }
1202
1203         /* Okay, so this is hash_expected and hash_location -
1204          * so we need to calculate the MD5 hash and compare.
1205          */
1206         genhash = tcp_v4_do_calc_md5_hash(newhash,
1207                                           hash_expected,
1208                                           iph->saddr, iph->daddr,
1209                                           th, sk->sk_protocol,
1210                                           skb->len);
1211
1212         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1213                 if (net_ratelimit()) {
1214                         printk(KERN_INFO "MD5 Hash failed for "
1215                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1216                                NIPQUAD(iph->saddr), ntohs(th->source),
1217                                NIPQUAD(iph->daddr), ntohs(th->dest),
1218                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219                 }
1220                 return 1;
1221         }
1222         return 0;
1223 }
1224
1225 #endif
1226
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_v4_send_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234 };
1235
1236 #ifdef CONFIG_TCP_MD5SIG
1237 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1238         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1239 };
1240 #endif
1241
1242 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1243         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1244         .twsk_unique    = tcp_twsk_unique,
1245         .twsk_destructor= tcp_twsk_destructor,
1246 };
1247
1248 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249 {
1250         struct inet_request_sock *ireq;
1251         struct tcp_options_received tmp_opt;
1252         struct request_sock *req;
1253         __be32 saddr = skb->nh.iph->saddr;
1254         __be32 daddr = skb->nh.iph->daddr;
1255         __u32 isn = TCP_SKB_CB(skb)->when;
1256         struct dst_entry *dst = NULL;
1257 #ifdef CONFIG_SYN_COOKIES
1258         int want_cookie = 0;
1259 #else
1260 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1261 #endif
1262
1263         /* Never answer SYNs sent to broadcast or multicast */
1264         if (((struct rtable *)skb->dst)->rt_flags &
1265             (RTCF_BROADCAST | RTCF_MULTICAST))
1266                 goto drop;
1267
1268         /* TW buckets are converted to open requests without
1269          * limitations; they conserve resources and the peer is
1270          * evidently a real one.
1271          */
1272         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1273 #ifdef CONFIG_SYN_COOKIES
1274                 if (sysctl_tcp_syncookies) {
1275                         want_cookie = 1;
1276                 } else
1277 #endif
1278                 goto drop;
1279         }
1280
1281         /* Accept backlog is full. If we have already queued enough
1282          * warm entries in the syn queue, drop the request. It is better than
1283          * clogging the syn queue with openreqs with exponentially increasing
1284          * timeout.
1285          */
1286         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1287                 goto drop;
1288
1289         req = reqsk_alloc(&tcp_request_sock_ops);
1290         if (!req)
1291                 goto drop;
1292
1293 #ifdef CONFIG_TCP_MD5SIG
1294         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1295 #endif
1296
1297         tcp_clear_options(&tmp_opt);
1298         tmp_opt.mss_clamp = 536;
1299         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1300
1301         tcp_parse_options(skb, &tmp_opt, 0);
1302
1303         if (want_cookie) {
1304                 tcp_clear_options(&tmp_opt);
1305                 tmp_opt.saw_tstamp = 0;
1306         }
1307
1308         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1309                 /* Some OSes (unknown ones, but I see them on web servers which
1310                  * contain information interesting only for windows
1311                  * users) do not send their stamp in the SYN. It is an easy case:
1312                  * we simply do not advertise TS support.
1313                  */
1314                 tmp_opt.saw_tstamp = 0;
1315                 tmp_opt.tstamp_ok  = 0;
1316         }
1317         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1318
1319         tcp_openreq_init(req, &tmp_opt, skb);
1320
1321         if (security_inet_conn_request(sk, skb, req))
1322                 goto drop_and_free;
1323
1324         ireq = inet_rsk(req);
1325         ireq->loc_addr = daddr;
1326         ireq->rmt_addr = saddr;
1327         ireq->opt = tcp_v4_save_options(sk, skb);
1328         if (!want_cookie)
1329                 TCP_ECN_create_request(req, skb->h.th);
1330
1331         if (want_cookie) {
1332 #ifdef CONFIG_SYN_COOKIES
1333                 syn_flood_warning(skb);
1334 #endif
1335                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1336         } else if (!isn) {
1337                 struct inet_peer *peer = NULL;
1338
1339                 /* VJ's idea. We save last timestamp seen
1340                  * from the destination in peer table, when entering
1341                  * state TIME-WAIT, and check against it before
1342                  * accepting a new connection request.
1343                  *
1344                  * If "isn" is not zero, this request hit an alive
1345                  * timewait bucket, so all the necessary checks
1346                  * are made in the function processing the timewait state.
1347                  */
1348                 if (tmp_opt.saw_tstamp &&
1349                     tcp_death_row.sysctl_tw_recycle &&
1350                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1351                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1352                     peer->v4daddr == saddr) {
1353                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1354                             (s32)(peer->tcp_ts - req->ts_recent) >
1355                                                         TCP_PAWS_WINDOW) {
1356                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1357                                 dst_release(dst);
1358                                 goto drop_and_free;
1359                         }
1360                 }
1361                 /* Kill the following clause, if you dislike this way. */
1362                 else if (!sysctl_tcp_syncookies &&
1363                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1364                           (sysctl_max_syn_backlog >> 2)) &&
1365                          (!peer || !peer->tcp_ts_stamp) &&
1366                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1367                         /* Without syncookies the last quarter of the
1368                          * backlog is filled with destinations
1369                          * proven to be alive.
1370                          * It means that we continue to communicate only
1371                          * with destinations already remembered
1372                          * at the moment of the synflood.
1373                          */
1374                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1375                                        "request from %u.%u.%u.%u/%u\n",
1376                                        NIPQUAD(saddr),
1377                                        ntohs(skb->h.th->source));
1378                         dst_release(dst);
1379                         goto drop_and_free;
1380                 }
1381
1382                 isn = tcp_v4_init_sequence(skb);
1383         }
1384         tcp_rsk(req)->snt_isn = isn;
1385
1386         if (tcp_v4_send_synack(sk, req, dst))
1387                 goto drop_and_free;
1388
1389         if (want_cookie) {
1390                 reqsk_free(req);
1391         } else {
1392                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1393         }
1394         return 0;
1395
1396 drop_and_free:
1397         reqsk_free(req);
1398 drop:
1399         return 0;
1400 }
1401
1402
1403 /*
1404  * The three way handshake has completed - we got a valid ACK -
1405  * now create the new socket.
1406  */
1407 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1408                                   struct request_sock *req,
1409                                   struct dst_entry *dst)
1410 {
1411         struct inet_request_sock *ireq;
1412         struct inet_sock *newinet;
1413         struct tcp_sock *newtp;
1414         struct sock *newsk;
1415 #ifdef CONFIG_TCP_MD5SIG
1416         struct tcp_md5sig_key *key;
1417 #endif
1418
1419         if (sk_acceptq_is_full(sk))
1420                 goto exit_overflow;
1421
1422         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1423                 goto exit;
1424
1425         newsk = tcp_create_openreq_child(sk, req, skb);
1426         if (!newsk)
1427                 goto exit;
1428
1429         newsk->sk_gso_type = SKB_GSO_TCPV4;
1430         sk_setup_caps(newsk, dst);
1431
1432         newtp                 = tcp_sk(newsk);
1433         newinet               = inet_sk(newsk);
1434         ireq                  = inet_rsk(req);
1435         newinet->daddr        = ireq->rmt_addr;
1436         newinet->rcv_saddr    = ireq->loc_addr;
1437         newinet->saddr        = ireq->loc_addr;
1438         newinet->opt          = ireq->opt;
1439         ireq->opt             = NULL;
1440         newinet->mc_index     = inet_iif(skb);
1441         newinet->mc_ttl       = skb->nh.iph->ttl;
1442         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1443         if (newinet->opt)
1444                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1445         newinet->id = newtp->write_seq ^ jiffies;
1446
1447         tcp_mtup_init(newsk);
1448         tcp_sync_mss(newsk, dst_mtu(dst));
1449         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1450         tcp_initialize_rcv_mss(newsk);
1451
1452 #ifdef CONFIG_TCP_MD5SIG
1453         /* Copy over the MD5 key from the original socket */
1454         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1455                 /*
1456                  * We're using one, so create a matching key
1457                  * on the newsk structure. If we fail to get
1458                  * memory, then we end up not copying the key
1459                  * across. Shucks.
1460                  */
1461                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1462                 if (newkey != NULL)
1463                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1464                                           newkey, key->keylen);
1465         }
1466 #endif
1467
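        /* Insert the child into the established hash and let it inherit the
         * listener's local port binding before returning it.
         */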
1468         __inet_hash(&tcp_hashinfo, newsk, 0);
1469         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1470
1471         return newsk;
1472
1473 exit_overflow:
1474         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1475 exit:
1476         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1477         dst_release(dst);
1478         return NULL;
1479 }
1480
1481 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1482 {
1483         struct tcphdr *th = skb->h.th;
1484         struct iphdr *iph = skb->nh.iph;
1485         struct sock *nsk;
1486         struct request_sock **prev;
1487         /* Find possible connection requests. */
1488         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1489                                                        iph->saddr, iph->daddr);
1490         if (req)
1491                 return tcp_check_req(sk, skb, req, prev);
1492
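        /* No pending open request; the segment may instead belong to an
         * already established (or TIME_WAIT) socket on the same four-tuple.
         */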
1493         nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1494                                       th->source, skb->nh.iph->daddr,
1495                                       th->dest, inet_iif(skb));
1496
1497         if (nsk) {
1498                 if (nsk->sk_state != TCP_TIME_WAIT) {
1499                         bh_lock_sock(nsk);
1500                         return nsk;
1501                 }
1502                 inet_twsk_put(inet_twsk(nsk));
1503                 return NULL;
1504         }
1505
1506 #ifdef CONFIG_SYN_COOKIES
1507         if (!th->rst && !th->syn && th->ack)
1508                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1509 #endif
1510         return sk;
1511 }
1512
1513 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1514 {
1515         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1516                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1517                                   skb->nh.iph->daddr, skb->csum)) {
1518                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1519                         return 0;
1520                 }
1521         }
1522
1523         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1524                                        skb->len, IPPROTO_TCP, 0);
1525
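        /* For short segments it is cheaper to verify the checksum right away;
         * the length threshold below is a heuristic.  Longer segments are left
         * for deferred (copy-and-checksum) verification.
         */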
1526         if (skb->len <= 76) {
1527                 return __skb_checksum_complete(skb);
1528         }
1529         return 0;
1530 }
1531
1532
1533 /* The socket must have its spinlock held when we get
1534  * here.
1535  *
1536  * We have a potential double-lock case here, so even when
1537  * doing backlog processing we use the BH locking scheme.
1538  * This is because we cannot sleep with the original spinlock
1539  * held.
1540  */
1541 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1542 {
1543         struct sock *rsk;
1544 #ifdef CONFIG_TCP_MD5SIG
1545         /*
1546          * We really want to reject the packet as early as possible
1547          * if:
1548          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1549          *  o There is an MD5 option and we're not expecting one
1550          */
1551         if (tcp_v4_inbound_md5_hash(sk, skb))
1552                 goto discard;
1553 #endif
1554
1555         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1556                 TCP_CHECK_TIMER(sk);
1557                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
1558                         rsk = sk;
1559                         goto reset;
1560                 }
1561                 TCP_CHECK_TIMER(sk);
1562                 return 0;
1563         }
1564
1565         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1566                 goto csum_err;
1567
1568         if (sk->sk_state == TCP_LISTEN) {
1569                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1570                 if (!nsk)
1571                         goto discard;
1572
1573                 if (nsk != sk) {
1574                         if (tcp_child_process(sk, nsk, skb)) {
1575                                 rsk = nsk;
1576                                 goto reset;
1577                         }
1578                         return 0;
1579                 }
1580         }
1581
1582         TCP_CHECK_TIMER(sk);
1583         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
1584                 rsk = sk;
1585                 goto reset;
1586         }
1587         TCP_CHECK_TIMER(sk);
1588         return 0;
1589
1590 reset:
1591         tcp_v4_send_reset(rsk, skb);
1592 discard:
1593         kfree_skb(skb);
1594         /* Be careful here. If this function gets more complicated and
1595          * gcc suffers from register pressure on the x86, sk (in %ebx)
1596          * might be destroyed here. This current version compiles correctly,
1597          * but you have been warned.
1598          */
1599         return 0;
1600
1601 csum_err:
1602         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1603         goto discard;
1604 }
1605
1606 /*
1607  *      From tcp_input.c
1608  */
1609
1610 int tcp_v4_rcv(struct sk_buff *skb)
1611 {
1612         struct tcphdr *th;
1613         struct sock *sk;
1614         int ret;
1615
1616         if (skb->pkt_type != PACKET_HOST)
1617                 goto discard_it;
1618
1619         /* Count it even if it's bad */
1620         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1621
1622         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1623                 goto discard_it;
1624
1625         th = skb->h.th;
1626
1627         if (th->doff < sizeof(struct tcphdr) / 4)
1628                 goto bad_packet;
1629         if (!pskb_may_pull(skb, th->doff * 4))
1630                 goto discard_it;
1631
1632         /* An explanation is required here, I think.
1633          * Packet length and doff are validated by header prediction,
1634          * provided the case of th->doff == 0 is eliminated.
1635          * So, we defer the checks. */
1636         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1637              tcp_v4_checksum_init(skb)))
1638                 goto bad_packet;
1639
1640         th = skb->h.th;
1641         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1642         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1643                                     skb->len - th->doff * 4);
1644         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1645         TCP_SKB_CB(skb)->when    = 0;
1646         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1647         TCP_SKB_CB(skb)->sacked  = 0;
1648
1649         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1650                            skb->nh.iph->daddr, th->dest,
1651                            inet_iif(skb));
1652
1653         if (!sk)
1654                 goto no_tcp_socket;
1655
1656 process:
1657         if (sk->sk_state == TCP_TIME_WAIT)
1658                 goto do_time_wait;
1659
1660         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1661                 goto discard_and_relse;
1662         nf_reset(skb);
1663
1664         if (sk_filter(sk, skb))
1665                 goto discard_and_relse;
1666
1667         skb->dev = NULL;
1668
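        /* If no user context owns the socket, process the segment now
         * (possibly via the prequeue or a DMA copy); otherwise queue it on
         * the backlog to be handled when the socket lock is released.
         */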
1669         bh_lock_sock_nested(sk);
1670         ret = 0;
1671         if (!sock_owned_by_user(sk)) {
1672 #ifdef CONFIG_NET_DMA
1673                 struct tcp_sock *tp = tcp_sk(sk);
1674                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1675                         tp->ucopy.dma_chan = get_softnet_dma();
1676                 if (tp->ucopy.dma_chan)
1677                         ret = tcp_v4_do_rcv(sk, skb);
1678                 else
1679 #endif
1680                 {
1681                         if (!tcp_prequeue(sk, skb))
1682                                 ret = tcp_v4_do_rcv(sk, skb);
1683                 }
1684         } else
1685                 sk_add_backlog(sk, skb);
1686         bh_unlock_sock(sk);
1687
1688         sock_put(sk);
1689
1690         return ret;
1691
1692 no_tcp_socket:
1693         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1694                 goto discard_it;
1695
1696         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1697 bad_packet:
1698                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1699         } else {
1700                 tcp_v4_send_reset(NULL, skb);
1701         }
1702
1703 discard_it:
1704         /* Discard frame. */
1705         kfree_skb(skb);
1706         return 0;
1707
1708 discard_and_relse:
1709         sock_put(sk);
1710         goto discard_it;
1711
1712 do_time_wait:
1713         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1714                 inet_twsk_put(inet_twsk(sk));
1715                 goto discard_it;
1716         }
1717
1718         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1719                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1720                 inet_twsk_put(inet_twsk(sk));
1721                 goto discard_it;
1722         }
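        /* tcp_timewait_state_process() decides whether this segment may open
         * a fresh connection (TCP_TW_SYN), should be ACKed, deserves a reset,
         * or can simply be dropped.
         */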
1723         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1724         case TCP_TW_SYN: {
1725                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1726                                                         skb->nh.iph->daddr,
1727                                                         th->dest,
1728                                                         inet_iif(skb));
1729                 if (sk2) {
1730                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1731                         inet_twsk_put(inet_twsk(sk));
1732                         sk = sk2;
1733                         goto process;
1734                 }
1735                 /* Fall through to ACK */
1736         }
1737         case TCP_TW_ACK:
1738                 tcp_v4_timewait_ack(sk, skb);
1739                 break;
1740         case TCP_TW_RST:
1741                 goto no_tcp_socket;
1742         case TCP_TW_SUCCESS:;
1743         }
1744         goto discard_it;
1745 }
1746
1747 /* VJ's idea.  Save the last timestamp seen from this destination and
1748  * hold it for at least the normal timewait interval, so it can be used
1749  * for duplicate segment detection in subsequent connections before they
1750  * enter the synchronized state.
1751  */
1752
1753 int tcp_v4_remember_stamp(struct sock *sk)
1754 {
1755         struct inet_sock *inet = inet_sk(sk);
1756         struct tcp_sock *tp = tcp_sk(sk);
1757         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1758         struct inet_peer *peer = NULL;
1759         int release_it = 0;
1760
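        /* Prefer the inet_peer bound to our cached route; if the route is
         * missing or points elsewhere, look the peer up explicitly and
         * remember to release that reference below.
         */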
1761         if (!rt || rt->rt_dst != inet->daddr) {
1762                 peer = inet_getpeer(inet->daddr, 1);
1763                 release_it = 1;
1764         } else {
1765                 if (!rt->peer)
1766                         rt_bind_peer(rt, 1);
1767                 peer = rt->peer;
1768         }
1769
1770         if (peer) {
1771                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1772                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1773                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1774                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1775                         peer->tcp_ts = tp->rx_opt.ts_recent;
1776                 }
1777                 if (release_it)
1778                         inet_putpeer(peer);
1779                 return 1;
1780         }
1781
1782         return 0;
1783 }
1784
1785 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1786 {
1787         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1788
1789         if (peer) {
1790                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1791
1792                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1793                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1794                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1795                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1796                         peer->tcp_ts       = tcptw->tw_ts_recent;
1797                 }
1798                 inet_putpeer(peer);
1799                 return 1;
1800         }
1801
1802         return 0;
1803 }
1804
1805 struct inet_connection_sock_af_ops ipv4_specific = {
1806         .queue_xmit        = ip_queue_xmit,
1807         .send_check        = tcp_v4_send_check,
1808         .rebuild_header    = inet_sk_rebuild_header,
1809         .conn_request      = tcp_v4_conn_request,
1810         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1811         .remember_stamp    = tcp_v4_remember_stamp,
1812         .net_header_len    = sizeof(struct iphdr),
1813         .setsockopt        = ip_setsockopt,
1814         .getsockopt        = ip_getsockopt,
1815         .addr2sockaddr     = inet_csk_addr2sockaddr,
1816         .sockaddr_len      = sizeof(struct sockaddr_in),
1817 #ifdef CONFIG_COMPAT
1818         .compat_setsockopt = compat_ip_setsockopt,
1819         .compat_getsockopt = compat_ip_getsockopt,
1820 #endif
1821 };
1822
1823 #ifdef CONFIG_TCP_MD5SIG
1824 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1825         .md5_lookup             = tcp_v4_md5_lookup,
1826         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1827         .md5_add                = tcp_v4_md5_add_func,
1828         .md5_parse              = tcp_v4_parse_md5_keys,
1829 };
1830 #endif
1831
1832 /* NOTE: A lot of things are set to zero explicitly by the call to
1833  *       sk_alloc(), so they need not be done here.
1834  */
1835 static int tcp_v4_init_sock(struct sock *sk)
1836 {
1837         struct inet_connection_sock *icsk = inet_csk(sk);
1838         struct tcp_sock *tp = tcp_sk(sk);
1839
1840         skb_queue_head_init(&tp->out_of_order_queue);
1841         tcp_init_xmit_timers(sk);
1842         tcp_prequeue_init(tp);
1843
1844         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1845         tp->mdev = TCP_TIMEOUT_INIT;
1846
1847         /* So many TCP implementations out there (incorrectly) count the
1848          * initial SYN frame in their delayed-ACK and congestion control
1849          * algorithms that we must have the following bandaid to talk
1850          * efficiently to them.  -DaveM
1851          */
1852         tp->snd_cwnd = 2;
1853
1854         /* See draft-stevens-tcpca-spec-01 for discussion of the
1855          * initialization of these values.
1856          */
1857         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1858         tp->snd_cwnd_clamp = ~0;
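        /* 536 bytes is the conservative RFC 1122 default MSS, used until a
         * better value is learned from the route or the peer.
         */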
1859         tp->mss_cache = 536;
1860
1861         tp->reordering = sysctl_tcp_reordering;
1862         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1863
1864         sk->sk_state = TCP_CLOSE;
1865
1866         sk->sk_write_space = sk_stream_write_space;
1867         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1868
1869         icsk->icsk_af_ops = &ipv4_specific;
1870         icsk->icsk_sync_mss = tcp_sync_mss;
1871 #ifdef CONFIG_TCP_MD5SIG
1872         tp->af_specific = &tcp_sock_ipv4_specific;
1873 #endif
1874
1875         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1876         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1877
1878         atomic_inc(&tcp_sockets_allocated);
1879
1880         return 0;
1881 }
1882
1883 int tcp_v4_destroy_sock(struct sock *sk)
1884 {
1885         struct tcp_sock *tp = tcp_sk(sk);
1886
1887         tcp_clear_xmit_timers(sk);
1888
1889         tcp_cleanup_congestion_control(sk);
1890
1891         /* Clean up the write buffer. */
1892         sk_stream_writequeue_purge(sk);
1893
1894         /* Cleans up our, hopefully empty, out_of_order_queue. */
1895         __skb_queue_purge(&tp->out_of_order_queue);
1896
1897 #ifdef CONFIG_TCP_MD5SIG
1898         /* Clean up the MD5 key list, if any */
1899         if (tp->md5sig_info) {
1900                 tcp_v4_clear_md5_list(sk);
1901                 kfree(tp->md5sig_info);
1902                 tp->md5sig_info = NULL;
1903         }
1904 #endif
1905
1906 #ifdef CONFIG_NET_DMA
1907         /* Cleans up our sk_async_wait_queue */
1908         __skb_queue_purge(&sk->sk_async_wait_queue);
1909 #endif
1910
1911         /* Clean up the prequeue; it really should be empty by now. */
1912         __skb_queue_purge(&tp->ucopy.prequeue);
1913
1914         /* Clean up a referenced TCP bind bucket. */
1915         if (inet_csk(sk)->icsk_bind_hash)
1916                 inet_put_port(&tcp_hashinfo, sk);
1917
1918         /*
1919          * If sendmsg cached page exists, toss it.
1920          */
1921         if (sk->sk_sndmsg_page) {
1922                 __free_page(sk->sk_sndmsg_page);
1923                 sk->sk_sndmsg_page = NULL;
1924         }
1925
1926         atomic_dec(&tcp_sockets_allocated);
1927
1928         return 0;
1929 }
1930
1931 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1932
1933 #ifdef CONFIG_PROC_FS
1934 /* Proc filesystem TCP sock list dumping. */
1935
1936 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1937 {
1938         return hlist_empty(head) ? NULL :
1939                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1940 }
1941
1942 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1943 {
1944         return tw->tw_node.next ?
1945                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1946 }
1947
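/* Walk the listening hash.  For each listening socket of the requested
 * family the SYN table (open requests) is walked as well before moving on
 * to the next socket or bucket.
 */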
1948 static void *listening_get_next(struct seq_file *seq, void *cur)
1949 {
1950         struct inet_connection_sock *icsk;
1951         struct hlist_node *node;
1952         struct sock *sk = cur;
1953         struct tcp_iter_state* st = seq->private;
1954
1955         if (!sk) {
1956                 st->bucket = 0;
1957                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1958                 goto get_sk;
1959         }
1960
1961         ++st->num;
1962
1963         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1964                 struct request_sock *req = cur;
1965
1966                 icsk = inet_csk(st->syn_wait_sk);
1967                 req = req->dl_next;
1968                 while (1) {
1969                         while (req) {
1970                                 if (req->rsk_ops->family == st->family) {
1971                                         cur = req;
1972                                         goto out;
1973                                 }
1974                                 req = req->dl_next;
1975                         }
1976                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1977                                 break;
1978 get_req:
1979                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1980                 }
1981                 sk        = sk_next(st->syn_wait_sk);
1982                 st->state = TCP_SEQ_STATE_LISTENING;
1983                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1984         } else {
1985                 icsk = inet_csk(sk);
1986                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1987                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1988                         goto start_req;
1989                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1990                 sk = sk_next(sk);
1991         }
1992 get_sk:
1993         sk_for_each_from(sk, node) {
1994                 if (sk->sk_family == st->family) {
1995                         cur = sk;
1996                         goto out;
1997                 }
1998                 icsk = inet_csk(sk);
1999                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2000                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2001 start_req:
2002                         st->uid         = sock_i_uid(sk);
2003                         st->syn_wait_sk = sk;
2004                         st->state       = TCP_SEQ_STATE_OPENREQ;
2005                         st->sbucket     = 0;
2006                         goto get_req;
2007                 }
2008                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009         }
2010         if (++st->bucket < INET_LHTABLE_SIZE) {
2011                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2012                 goto get_sk;
2013         }
2014         cur = NULL;
2015 out:
2016         return cur;
2017 }
2018
2019 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2020 {
2021         void *rc = listening_get_next(seq, NULL);
2022
2023         while (rc && *pos) {
2024                 rc = listening_get_next(seq, rc);
2025                 --*pos;
2026         }
2027         return rc;
2028 }
2029
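/* The established hash keeps TIME_WAIT sockets in its upper half, so each
 * bucket is scanned twice: once for full sockets and once for timewait
 * entries.
 */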
2030 static void *established_get_first(struct seq_file *seq)
2031 {
2032         struct tcp_iter_state* st = seq->private;
2033         void *rc = NULL;
2034
2035         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2036                 struct sock *sk;
2037                 struct hlist_node *node;
2038                 struct inet_timewait_sock *tw;
2039
2040                 /* We can reschedule _before_ having picked the target: */
2041                 cond_resched_softirq();
2042
2043                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2044                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2045                         if (sk->sk_family != st->family) {
2046                                 continue;
2047                         }
2048                         rc = sk;
2049                         goto out;
2050                 }
2051                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2052                 inet_twsk_for_each(tw, node,
2053                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
2054                         if (tw->tw_family != st->family) {
2055                                 continue;
2056                         }
2057                         rc = tw;
2058                         goto out;
2059                 }
2060                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2061                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062         }
2063 out:
2064         return rc;
2065 }
2066
2067 static void *established_get_next(struct seq_file *seq, void *cur)
2068 {
2069         struct sock *sk = cur;
2070         struct inet_timewait_sock *tw;
2071         struct hlist_node *node;
2072         struct tcp_iter_state* st = seq->private;
2073
2074         ++st->num;
2075
2076         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2077                 tw = cur;
2078                 tw = tw_next(tw);
2079 get_tw:
2080                 while (tw && tw->tw_family != st->family) {
2081                         tw = tw_next(tw);
2082                 }
2083                 if (tw) {
2084                         cur = tw;
2085                         goto out;
2086                 }
2087                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2088                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2089
2090                 /* We can reschedule between buckets: */
2091                 cond_resched_softirq();
2092
2093                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2094                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2095                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2096                 } else {
2097                         cur = NULL;
2098                         goto out;
2099                 }
2100         } else
2101                 sk = sk_next(sk);
2102
2103         sk_for_each_from(sk, node) {
2104                 if (sk->sk_family == st->family)
2105                         goto found;
2106         }
2107
2108         st->state = TCP_SEQ_STATE_TIME_WAIT;
2109         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
2110         goto get_tw;
2111 found:
2112         cur = sk;
2113 out:
2114         return cur;
2115 }
2116
2117 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2118 {
2119         void *rc = established_get_first(seq);
2120
2121         while (rc && pos) {
2122                 rc = established_get_next(seq, rc);
2123                 --pos;
2124         }
2125         return rc;
2126 }
2127
2128 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2129 {
2130         void *rc;
2131         struct tcp_iter_state* st = seq->private;
2132
2133         inet_listen_lock(&tcp_hashinfo);
2134         st->state = TCP_SEQ_STATE_LISTENING;
2135         rc        = listening_get_idx(seq, &pos);
2136
2137         if (!rc) {
2138                 inet_listen_unlock(&tcp_hashinfo);
2139                 local_bh_disable();
2140                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2141                 rc        = established_get_idx(seq, pos);
2142         }
2143
2144         return rc;
2145 }
2146
2147 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2148 {
2149         struct tcp_iter_state* st = seq->private;
2150         st->state = TCP_SEQ_STATE_LISTENING;
2151         st->num = 0;
2152         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2153 }
2154
2155 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2156 {
2157         void *rc = NULL;
2158         struct tcp_iter_state* st;
2159
2160         if (v == SEQ_START_TOKEN) {
2161                 rc = tcp_get_idx(seq, 0);
2162                 goto out;
2163         }
2164         st = seq->private;
2165
2166         switch (st->state) {
2167         case TCP_SEQ_STATE_OPENREQ:
2168         case TCP_SEQ_STATE_LISTENING:
2169                 rc = listening_get_next(seq, v);
2170                 if (!rc) {
2171                         inet_listen_unlock(&tcp_hashinfo);
2172                         local_bh_disable();
2173                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2174                         rc        = established_get_first(seq);
2175                 }
2176                 break;
2177         case TCP_SEQ_STATE_ESTABLISHED:
2178         case TCP_SEQ_STATE_TIME_WAIT:
2179                 rc = established_get_next(seq, v);
2180                 break;
2181         }
2182 out:
2183         ++*pos;
2184         return rc;
2185 }
2186
2187 static void tcp_seq_stop(struct seq_file *seq, void *v)
2188 {
2189         struct tcp_iter_state* st = seq->private;
2190
2191         switch (st->state) {
2192         case TCP_SEQ_STATE_OPENREQ:
2193                 if (v) {
2194                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2195                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2196                 }
2197         case TCP_SEQ_STATE_LISTENING:
2198                 if (v != SEQ_START_TOKEN)
2199                         inet_listen_unlock(&tcp_hashinfo);
2200                 break;
2201         case TCP_SEQ_STATE_TIME_WAIT:
2202         case TCP_SEQ_STATE_ESTABLISHED:
2203                 if (v)
2204                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2205                 local_bh_enable();
2206                 break;
2207         }
2208 }
2209
2210 static int tcp_seq_open(struct inode *inode, struct file *file)
2211 {
2212         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2213         struct seq_file *seq;
2214         struct tcp_iter_state *s;
2215         int rc;
2216
2217         if (unlikely(afinfo == NULL))
2218                 return -EINVAL;
2219
2220         s = kzalloc(sizeof(*s), GFP_KERNEL);
2221         if (!s)
2222                 return -ENOMEM;
2223         s->family               = afinfo->family;
2224         s->seq_ops.start        = tcp_seq_start;
2225         s->seq_ops.next         = tcp_seq_next;
2226         s->seq_ops.show         = afinfo->seq_show;
2227         s->seq_ops.stop         = tcp_seq_stop;
2228
2229         rc = seq_open(file, &s->seq_ops);
2230         if (rc)
2231                 goto out_kfree;
2232         seq          = file->private_data;
2233         seq->private = s;
2234 out:
2235         return rc;
2236 out_kfree:
2237         kfree(s);
2238         goto out;
2239 }
2240
2241 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2242 {
2243         int rc = 0;
2244         struct proc_dir_entry *p;
2245
2246         if (!afinfo)
2247                 return -EINVAL;
2248         afinfo->seq_fops->owner         = afinfo->owner;
2249         afinfo->seq_fops->open          = tcp_seq_open;
2250         afinfo->seq_fops->read          = seq_read;
2251         afinfo->seq_fops->llseek        = seq_lseek;
2252         afinfo->seq_fops->release       = seq_release_private;
2253
2254         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2255         if (p)
2256                 p->data = afinfo;
2257         else
2258                 rc = -ENOMEM;
2259         return rc;
2260 }
2261
2262 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2263 {
2264         if (!afinfo)
2265                 return;
2266         proc_net_remove(afinfo->name);
2267         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2268 }
2269
2270 static void get_openreq4(struct sock *sk, struct request_sock *req,
2271                          char *tmpbuf, int i, int uid)
2272 {
2273         const struct inet_request_sock *ireq = inet_rsk(req);
2274         int ttd = req->expires - jiffies;
2275
2276         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2277                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2278                 i,
2279                 ireq->loc_addr,
2280                 ntohs(inet_sk(sk)->sport),
2281                 ireq->rmt_addr,
2282                 ntohs(ireq->rmt_port),
2283                 TCP_SYN_RECV,
2284                 0, 0, /* could print option size, but that is af dependent. */
2285                 1,    /* timers active (only the expire timer) */
2286                 jiffies_to_clock_t(ttd),
2287                 req->retrans,
2288                 uid,
2289                 0,  /* non standard timer */
2290                 0, /* open_requests have no inode */
2291                 atomic_read(&sk->sk_refcnt),
2292                 req);
2293 }
2294
2295 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2296 {
2297         int timer_active;
2298         unsigned long timer_expires;
2299         struct tcp_sock *tp = tcp_sk(sp);
2300         const struct inet_connection_sock *icsk = inet_csk(sp);
2301         struct inet_sock *inet = inet_sk(sp);
2302         __be32 dest = inet->daddr;
2303         __be32 src = inet->rcv_saddr;
2304         __u16 destp = ntohs(inet->dport);
2305         __u16 srcp = ntohs(inet->sport);
2306
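        /* Encode the pending timer for the "tr" column of /proc/net/tcp:
         * 1 retransmit, 4 zero-window probe, 2 the socket's sk_timer
         * (typically keepalive), 0 none.
         */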
2307         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2308                 timer_active    = 1;
2309                 timer_expires   = icsk->icsk_timeout;
2310         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2311                 timer_active    = 4;
2312                 timer_expires   = icsk->icsk_timeout;
2313         } else if (timer_pending(&sp->sk_timer)) {
2314                 timer_active    = 2;
2315                 timer_expires   = sp->sk_timer.expires;
2316         } else {
2317                 timer_active    = 0;
2318                 timer_expires = jiffies;
2319         }
2320
2321         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2322                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2323                 i, src, srcp, dest, destp, sp->sk_state,
2324                 tp->write_seq - tp->snd_una,
2325                 sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
2326                                              (tp->rcv_nxt - tp->copied_seq),
2327                 timer_active,
2328                 jiffies_to_clock_t(timer_expires - jiffies),
2329                 icsk->icsk_retransmits,
2330                 sock_i_uid(sp),
2331                 icsk->icsk_probes_out,
2332                 sock_i_ino(sp),
2333                 atomic_read(&sp->sk_refcnt), sp,
2334                 icsk->icsk_rto,
2335                 icsk->icsk_ack.ato,
2336                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2337                 tp->snd_cwnd,
2338                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2339 }
2340
2341 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2342                                char *tmpbuf, int i)
2343 {
2344         __be32 dest, src;
2345         __u16 destp, srcp;
2346         int ttd = tw->tw_ttd - jiffies;
2347
2348         if (ttd < 0)
2349                 ttd = 0;
2350
2351         dest  = tw->tw_daddr;
2352         src   = tw->tw_rcv_saddr;
2353         destp = ntohs(tw->tw_dport);
2354         srcp  = ntohs(tw->tw_sport);
2355
2356         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2357                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2358                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2359                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2360                 atomic_read(&tw->tw_refcnt), tw);
2361 }
2362
2363 #define TMPSZ 150
2364
2365 static int tcp4_seq_show(struct seq_file *seq, void *v)
2366 {
2367         struct tcp_iter_state* st;
2368         char tmpbuf[TMPSZ + 1];
2369
2370         if (v == SEQ_START_TOKEN) {
2371                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2372                            "  sl  local_address rem_address   st tx_queue "
2373                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2374                            "inode");
2375                 goto out;
2376         }
2377         st = seq->private;
2378
2379         switch (st->state) {
2380         case TCP_SEQ_STATE_LISTENING:
2381         case TCP_SEQ_STATE_ESTABLISHED:
2382                 get_tcp4_sock(v, tmpbuf, st->num);
2383                 break;
2384         case TCP_SEQ_STATE_OPENREQ:
2385                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2386                 break;
2387         case TCP_SEQ_STATE_TIME_WAIT:
2388                 get_timewait4_sock(v, tmpbuf, st->num);
2389                 break;
2390         }
2391         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2392 out:
2393         return 0;
2394 }
2395
2396 static struct file_operations tcp4_seq_fops;
2397 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2398         .owner          = THIS_MODULE,
2399         .name           = "tcp",
2400         .family         = AF_INET,
2401         .seq_show       = tcp4_seq_show,
2402         .seq_fops       = &tcp4_seq_fops,
2403 };
2404
2405 int __init tcp4_proc_init(void)
2406 {
2407         return tcp_proc_register(&tcp4_seq_afinfo);
2408 }
2409
2410 void tcp4_proc_exit(void)
2411 {
2412         tcp_proc_unregister(&tcp4_seq_afinfo);
2413 }
2414 #endif /* CONFIG_PROC_FS */
2415
2416 struct proto tcp_prot = {
2417         .name                   = "TCP",
2418         .owner                  = THIS_MODULE,
2419         .close                  = tcp_close,
2420         .connect                = tcp_v4_connect,
2421         .disconnect             = tcp_disconnect,
2422         .accept                 = inet_csk_accept,
2423         .ioctl                  = tcp_ioctl,
2424         .init                   = tcp_v4_init_sock,
2425         .destroy                = tcp_v4_destroy_sock,
2426         .shutdown               = tcp_shutdown,
2427         .setsockopt             = tcp_setsockopt,
2428         .getsockopt             = tcp_getsockopt,
2429         .sendmsg                = tcp_sendmsg,
2430         .recvmsg                = tcp_recvmsg,
2431         .backlog_rcv            = tcp_v4_do_rcv,
2432         .hash                   = tcp_v4_hash,
2433         .unhash                 = tcp_unhash,
2434         .get_port               = tcp_v4_get_port,
2435         .enter_memory_pressure  = tcp_enter_memory_pressure,
2436         .sockets_allocated      = &tcp_sockets_allocated,
2437         .orphan_count           = &tcp_orphan_count,
2438         .memory_allocated       = &tcp_memory_allocated,
2439         .memory_pressure        = &tcp_memory_pressure,
2440         .sysctl_mem             = sysctl_tcp_mem,
2441         .sysctl_wmem            = sysctl_tcp_wmem,
2442         .sysctl_rmem            = sysctl_tcp_rmem,
2443         .max_header             = MAX_TCP_HEADER,
2444         .obj_size               = sizeof(struct tcp_sock),
2445         .twsk_prot              = &tcp_timewait_sock_ops,
2446         .rsk_prot               = &tcp_request_sock_ops,
2447 #ifdef CONFIG_COMPAT
2448         .compat_setsockopt      = compat_tcp_setsockopt,
2449         .compat_getsockopt      = compat_tcp_getsockopt,
2450 #endif
2451 };
2452
2453 void __init tcp_v4_init(struct net_proto_family *ops)
2454 {
2455         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2456                                      IPPROTO_TCP) < 0)
2457                 panic("Failed to create the TCP control socket.\n");
2458 }
2459
2460 EXPORT_SYMBOL(ipv4_specific);
2461 EXPORT_SYMBOL(tcp_hashinfo);
2462 EXPORT_SYMBOL(tcp_prot);
2463 EXPORT_SYMBOL(tcp_unhash);
2464 EXPORT_SYMBOL(tcp_v4_conn_request);
2465 EXPORT_SYMBOL(tcp_v4_connect);
2466 EXPORT_SYMBOL(tcp_v4_do_rcv);
2467 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2468 EXPORT_SYMBOL(tcp_v4_send_check);
2469 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2470
2471 #ifdef CONFIG_PROC_FS
2472 EXPORT_SYMBOL(tcp_proc_register);
2473 EXPORT_SYMBOL(tcp_proc_unregister);
2474 #endif
2475 EXPORT_SYMBOL(sysctl_local_port_range);
2476 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2477