linux-2.6: net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87 /* Check TCP sequence numbers in ICMP packets. */
88 #define ICMP_MIN_LENGTH 8
89
90 /* Socket used for sending RSTs */
91 static struct socket *tcp_socket;
92
93 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97                                                    __be32 addr);
98 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99                                    __be32 saddr, __be32 daddr,
100                                    struct tcphdr *th, int protocol,
101                                    int tcplen);
102 #endif
103
104 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106         .lhash_users = ATOMIC_INIT(0),
107         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108 };
109
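/* Thin wrappers: bind-time port selection and (un)hashing are delegated to
 * the generic inet_connection_sock/inet_hashtables helpers, always passing
 * the TCP hash table defined above.
 */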
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
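/* Derive the initial sequence number for a passive open from a keyed hash of
 * the four-tuple carried in the incoming SYN (addresses and ports below).
 */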
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(skb->nh.iph->daddr,
129                                           skb->nh.iph->saddr,
130                                           skb->h.th->dest,
131                                           skb->h.th->source);
132 }
133
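/* Decide whether a new connection may reuse the port pair still held by a
 * TIME-WAIT socket: reuse is allowed when the cached timestamps let us pick
 * a write_seq that old duplicate segments cannot collide with (see below).
 */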
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided sequence
141            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142
143            Actually, the idea is close to VJ's: only the timestamp cache is
144            held not per host but per port pair, and the TW bucket is used as
145            the state holder.
146
147            If the TW bucket has already been destroyed, we fall back to VJ's
148            scheme and use the initial timestamp retrieved from the peer table.
149          */
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167 /* This will initiate an outgoing connection. */
168 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169 {
170         struct inet_sock *inet = inet_sk(sk);
171         struct tcp_sock *tp = tcp_sk(sk);
172         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173         struct rtable *rt;
174         __be32 daddr, nexthop;
175         int tmp;
176         int err;
177
178         if (addr_len < sizeof(struct sockaddr_in))
179                 return -EINVAL;
180
181         if (usin->sin_family != AF_INET)
182                 return -EAFNOSUPPORT;
183
184         nexthop = daddr = usin->sin_addr.s_addr;
185         if (inet->opt && inet->opt->srr) {
186                 if (!daddr)
187                         return -EINVAL;
188                 nexthop = inet->opt->faddr;
189         }
190
191         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193                                IPPROTO_TCP,
194                                inet->sport, usin->sin_port, sk, 1);
195         if (tmp < 0)
196                 return tmp;
197
198         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
199                 ip_rt_put(rt);
200                 return -ENETUNREACH;
201         }
202
203         if (!inet->opt || !inet->opt->srr)
204                 daddr = rt->rt_dst;
205
206         if (!inet->saddr)
207                 inet->saddr = rt->rt_src;
208         inet->rcv_saddr = inet->saddr;
209
210         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
211                 /* Reset inherited state */
212                 tp->rx_opt.ts_recent       = 0;
213                 tp->rx_opt.ts_recent_stamp = 0;
214                 tp->write_seq              = 0;
215         }
216
217         if (tcp_death_row.sysctl_tw_recycle &&
218             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
219                 struct inet_peer *peer = rt_get_peer(rt);
220                 /*
221                  * VJ's idea. We save last timestamp seen from
222                  * the destination in peer table, when entering state
223                  * TIME-WAIT and initialize rx_opt.ts_recent from it,
224                  * when trying new connection.
225                  */
226                 if (peer != NULL &&
227                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
228                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
229                         tp->rx_opt.ts_recent = peer->tcp_ts;
230                 }
231         }
232
233         inet->dport = usin->sin_port;
234         inet->daddr = daddr;
235
236         inet_csk(sk)->icsk_ext_hdr_len = 0;
237         if (inet->opt)
238                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
239
240         tp->rx_opt.mss_clamp = 536;
241
242         /* Socket identity is still unknown (sport may be zero).
243          * However we set state to SYN-SENT and, without releasing the socket
244          * lock, select a source port, enter ourselves into the hash tables and
245          * complete initialization after this.
246          */
247         tcp_set_state(sk, TCP_SYN_SENT);
248         err = inet_hash_connect(&tcp_death_row, sk);
249         if (err)
250                 goto failure;
251
252         err = ip_route_newports(&rt, IPPROTO_TCP,
253                                 inet->sport, inet->dport, sk);
254         if (err)
255                 goto failure;
256
257         /* OK, now commit destination to socket.  */
258         sk->sk_gso_type = SKB_GSO_TCPV4;
259         sk_setup_caps(sk, &rt->u.dst);
260
261         if (!tp->write_seq)
262                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
263                                                            inet->daddr,
264                                                            inet->sport,
265                                                            usin->sin_port);
266
267         inet->id = tp->write_seq ^ jiffies;
268
269         err = tcp_connect(sk);
270         rt = NULL;
271         if (err)
272                 goto failure;
273
274         return 0;
275
276 failure:
277         /*
278          * This unhashes the socket and releases the local port,
279          * if necessary.
280          */
281         tcp_set_state(sk, TCP_CLOSE);
282         ip_rt_put(rt);
283         sk->sk_route_caps = 0;
284         inet->dport = 0;
285         return err;
286 }
287
288 /*
289  * This routine does path mtu discovery as defined in RFC1191.
290  */
291 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
292 {
293         struct dst_entry *dst;
294         struct inet_sock *inet = inet_sk(sk);
295
296         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
297          * sent out by Linux are always < 576 bytes, so they should go through
298          * unfragmented).
299          */
300         if (sk->sk_state == TCP_LISTEN)
301                 return;
302
303         /* We don't check in the dst entry if PMTU discovery is forbidden
304          * on this route. We just assume that no packet-too-big packets
305          * are sent back when PMTU discovery is not active.
306          * There is a small race when the user changes this flag in the
307          * route, but I think that's acceptable.
308          */
309         if ((dst = __sk_dst_check(sk, 0)) == NULL)
310                 return;
311
312         dst->ops->update_pmtu(dst, mtu);
313
314         /* Something is about to go wrong... Remember the soft error
315          * in case this connection is not able to recover.
316          */
317         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
318                 sk->sk_err_soft = EMSGSIZE;
319
320         mtu = dst_mtu(dst);
321
322         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
323             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
324                 tcp_sync_mss(sk, mtu);
325
326                 /* Resend the TCP packet because it's
327                  * clear that the old packet has been
328                  * dropped. This is the new "fast" path mtu
329                  * discovery.
330                  */
331                 tcp_simple_retransmit(sk);
332         } /* else let the usual retransmit timer handle it */
333 }
334
335 /*
336  * This routine is called by the ICMP module when it gets some
337  * sort of error condition.  If err < 0 then the socket should
338  * be closed and the error returned to the user.  If err > 0
339  * it's just the icmp type << 8 | icmp code.  After adjustment
340  * header points to the first 8 bytes of the tcp header.  We need
341  * to find the appropriate port.
342  *
343  * The locking strategy used here is very "optimistic". When
344  * someone else accesses the socket the ICMP is just dropped
345  * and for some paths there is no check at all.
346  * A more general error queue to queue errors for later handling
347  * is probably better.
348  *
349  */
350
351 void tcp_v4_err(struct sk_buff *skb, u32 info)
352 {
353         struct iphdr *iph = (struct iphdr *)skb->data;
354         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
355         struct tcp_sock *tp;
356         struct inet_sock *inet;
357         int type = skb->h.icmph->type;
358         int code = skb->h.icmph->code;
359         struct sock *sk;
360         __u32 seq;
361         int err;
362
363         if (skb->len < (iph->ihl << 2) + 8) {
364                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
365                 return;
366         }
367
368         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
369                          th->source, inet_iif(skb));
370         if (!sk) {
371                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
372                 return;
373         }
374         if (sk->sk_state == TCP_TIME_WAIT) {
375                 inet_twsk_put(inet_twsk(sk));
376                 return;
377         }
378
379         bh_lock_sock(sk);
380         /* If too many ICMPs get dropped on busy
381          * servers this needs to be solved differently.
382          */
383         if (sock_owned_by_user(sk))
384                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
385
386         if (sk->sk_state == TCP_CLOSE)
387                 goto out;
388
389         tp = tcp_sk(sk);
390         seq = ntohl(th->seq);
391         if (sk->sk_state != TCP_LISTEN &&
392             !between(seq, tp->snd_una, tp->snd_nxt)) {
393                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
394                 goto out;
395         }
396
397         switch (type) {
398         case ICMP_SOURCE_QUENCH:
399                 /* Just silently ignore these. */
400                 goto out;
401         case ICMP_PARAMETERPROB:
402                 err = EPROTO;
403                 break;
404         case ICMP_DEST_UNREACH:
405                 if (code > NR_ICMP_UNREACH)
406                         goto out;
407
408                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409                         if (!sock_owned_by_user(sk))
410                                 do_pmtu_discovery(sk, iph, info);
411                         goto out;
412                 }
413
414                 err = icmp_err_convert[code].errno;
415                 break;
416         case ICMP_TIME_EXCEEDED:
417                 err = EHOSTUNREACH;
418                 break;
419         default:
420                 goto out;
421         }
422
423         switch (sk->sk_state) {
424                 struct request_sock *req, **prev;
425         case TCP_LISTEN:
426                 if (sock_owned_by_user(sk))
427                         goto out;
428
429                 req = inet_csk_search_req(sk, &prev, th->dest,
430                                           iph->daddr, iph->saddr);
431                 if (!req)
432                         goto out;
433
434                 /* ICMPs are not backlogged, hence we cannot get
435                    an established socket here.
436                  */
437                 BUG_TRAP(!req->sk);
438
439                 if (seq != tcp_rsk(req)->snt_isn) {
440                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
441                         goto out;
442                 }
443
444                 /*
445                  * Still in SYN_RECV, just remove it silently.
446                  * There is no good way to pass the error to the newly
447                  * created socket, and POSIX does not want network
448                  * errors returned from accept().
449                  */
450                 inet_csk_reqsk_queue_drop(sk, req, prev);
451                 goto out;
452
453         case TCP_SYN_SENT:
454         case TCP_SYN_RECV:  /* Cannot happen.
455                                It can, e.g., happen if SYNs crossed.
456                              */
457                 if (!sock_owned_by_user(sk)) {
458                         sk->sk_err = err;
459
460                         sk->sk_error_report(sk);
461
462                         tcp_done(sk);
463                 } else {
464                         sk->sk_err_soft = err;
465                 }
466                 goto out;
467         }
468
469         /* If we've already connected we will keep trying
470          * until we time out, or the user gives up.
471          *
472          * RFC 1122 4.2.3.9 allows us to treat as hard errors
473          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
474          * but it is obsoleted by PMTU discovery).
475          *
476          * Note that in the modern internet, where routing is unreliable
477          * and broken firewalls sit in every dark corner sending random
478          * errors ordered by their masters, even these two messages finally lose
479          * their original sense (even Linux sends invalid PORT_UNREACHs).
480          *
481          * Now we are in compliance with RFCs.
482          *                                                      --ANK (980905)
483          */
484
485         inet = inet_sk(sk);
486         if (!sock_owned_by_user(sk) && inet->recverr) {
487                 sk->sk_err = err;
488                 sk->sk_error_report(sk);
489         } else  { /* Only an error on timeout */
490                 sk->sk_err_soft = err;
491         }
492
493 out:
494         bh_unlock_sock(sk);
495         sock_put(sk);
496 }
497
498 /* This routine computes an IPv4 TCP checksum. */
499 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
500 {
501         struct inet_sock *inet = inet_sk(sk);
502         struct tcphdr *th = skb->h.th;
503
504         if (skb->ip_summed == CHECKSUM_PARTIAL) {
505                 th->check = ~tcp_v4_check(len, inet->saddr,
506                                           inet->daddr, 0);
507                 skb->csum_offset = offsetof(struct tcphdr, check);
508         } else {
509                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
510                                          csum_partial((char *)th,
511                                                       th->doff << 2,
512                                                       skb->csum));
513         }
514 }
515
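/* Prepare a GSO segment for checksum offload: seed th->check with the
 * pseudo-header sum and mark the skb CHECKSUM_PARTIAL so the device (or
 * software GSO) completes the checksum per segment.
 */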
516 int tcp_v4_gso_send_check(struct sk_buff *skb)
517 {
518         struct iphdr *iph;
519         struct tcphdr *th;
520
521         if (!pskb_may_pull(skb, sizeof(*th)))
522                 return -EINVAL;
523
524         iph = skb->nh.iph;
525         th = skb->h.th;
526
527         th->check = 0;
528         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
529         skb->csum_offset = offsetof(struct tcphdr, check);
530         skb->ip_summed = CHECKSUM_PARTIAL;
531         return 0;
532 }
533
534 /*
535  *      This routine will send an RST to the other tcp.
536  *
537  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
538  *                    for the reset.
539  *      Answer: if a packet caused the RST, it is not for a socket
540  *              existing in our system; if it is matched to a socket,
541  *              it is just a duplicate segment or a bug in the other side's TCP.
542  *              So we build the reply based only on parameters that
543  *              arrived with the segment.
544  *      Exception: precedence violation. We do not implement it in any case.
545  */
546
547 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
548 {
549         struct tcphdr *th = skb->h.th;
550         struct {
551                 struct tcphdr th;
552 #ifdef CONFIG_TCP_MD5SIG
553                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
554 #endif
555         } rep;
556         struct ip_reply_arg arg;
557 #ifdef CONFIG_TCP_MD5SIG
558         struct tcp_md5sig_key *key;
559 #endif
560
561         /* Never send a reset in response to a reset. */
562         if (th->rst)
563                 return;
564
565         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
566                 return;
567
568         /* Swap the send and the receive. */
569         memset(&rep, 0, sizeof(rep));
570         rep.th.dest   = th->source;
571         rep.th.source = th->dest;
572         rep.th.doff   = sizeof(struct tcphdr) / 4;
573         rep.th.rst    = 1;
574
575         if (th->ack) {
576                 rep.th.seq = th->ack_seq;
577         } else {
578                 rep.th.ack = 1;
579                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
580                                        skb->len - (th->doff << 2));
581         }
582
583         memset(&arg, 0, sizeof(arg));
584         arg.iov[0].iov_base = (unsigned char *)&rep;
585         arg.iov[0].iov_len  = sizeof(rep.th);
586
587 #ifdef CONFIG_TCP_MD5SIG
588         key = sk ? tcp_v4_md5_do_lookup(sk, skb->nh.iph->daddr) : NULL;
589         if (key) {
590                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
591                                    (TCPOPT_NOP << 16) |
592                                    (TCPOPT_MD5SIG << 8) |
593                                    TCPOLEN_MD5SIG);
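                /* The aligned option block built here is: NOP, NOP, kind 19
                 * (TCPOPT_MD5SIG), length 18, followed by the 16-byte digest
                 * that tcp_v4_do_calc_md5_hash() writes into rep.opt[1..4].
                 */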
594                 /* Update length and the length the header thinks exists */
595                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
596                 rep.th.doff = arg.iov[0].iov_len / 4;
597
598                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
599                                         key,
600                                         skb->nh.iph->daddr,
601                                         skb->nh.iph->saddr,
602                                         &rep.th, IPPROTO_TCP,
603                                         arg.iov[0].iov_len);
604         }
605 #endif
606         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
607                                       skb->nh.iph->saddr, /* XXX */
608                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
609         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
610
611         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
612
613         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
614         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
615 }
616
617 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
618    outside socket context, is certainly ugly. What can I do?
619  */
620
621 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
622                             struct sk_buff *skb, u32 seq, u32 ack,
623                             u32 win, u32 ts)
624 {
625         struct tcphdr *th = skb->h.th;
626         struct {
627                 struct tcphdr th;
628                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
629 #ifdef CONFIG_TCP_MD5SIG
630                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
631 #endif
632                         ];
633         } rep;
634         struct ip_reply_arg arg;
635 #ifdef CONFIG_TCP_MD5SIG
636         struct tcp_md5sig_key *key;
637         struct tcp_md5sig_key tw_key;
638 #endif
639
640         memset(&rep.th, 0, sizeof(struct tcphdr));
641         memset(&arg, 0, sizeof(arg));
642
643         arg.iov[0].iov_base = (unsigned char *)&rep;
644         arg.iov[0].iov_len  = sizeof(rep.th);
645         if (ts) {
646                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
647                                    (TCPOPT_TIMESTAMP << 8) |
648                                    TCPOLEN_TIMESTAMP);
649                 rep.opt[1] = htonl(tcp_time_stamp);
650                 rep.opt[2] = htonl(ts);
651                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
652         }
653
654         /* Swap the send and the receive. */
655         rep.th.dest    = th->source;
656         rep.th.source  = th->dest;
657         rep.th.doff    = arg.iov[0].iov_len / 4;
658         rep.th.seq     = htonl(seq);
659         rep.th.ack_seq = htonl(ack);
660         rep.th.ack     = 1;
661         rep.th.window  = htons(win);
662
663 #ifdef CONFIG_TCP_MD5SIG
664         /*
665          * The SKB holds an incoming packet, but may not have a valid ->sk
666          * pointer. This is especially the case when we're dealing with a
667          * TIME_WAIT ack, because the sk structure is long gone, and only
668          * the tcp_timewait_sock remains. So the md5 key is stashed in that
669          * structure, and we use it in preference.  I believe that (twsk ||
670          * skb->sk) holds true, but we program defensively.
671          */
672         if (!twsk && skb->sk) {
673                 key = tcp_v4_md5_do_lookup(skb->sk, skb->nh.iph->daddr);
674         } else if (twsk && twsk->tw_md5_keylen) {
675                 tw_key.key = twsk->tw_md5_key;
676                 tw_key.keylen = twsk->tw_md5_keylen;
677                 key = &tw_key;
678         } else
679                 key = NULL;
680
681         if (key) {
682                 int offset = (ts) ? 3 : 0;
683
684                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
685                                           (TCPOPT_NOP << 16) |
686                                           (TCPOPT_MD5SIG << 8) |
687                                           TCPOLEN_MD5SIG);
688                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
689                 rep.th.doff = arg.iov[0].iov_len/4;
690
691                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
692                                         key,
693                                         skb->nh.iph->daddr,
694                                         skb->nh.iph->saddr,
695                                         &rep.th, IPPROTO_TCP,
696                                         arg.iov[0].iov_len);
697         }
698 #endif
699         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
700                                       skb->nh.iph->saddr, /* XXX */
701                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
702         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
703
704         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
705
706         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
707 }
708
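/* Answer a segment that hit a TIME-WAIT bucket: echo back the sequence/ack
 * numbers, scaled window and timestamp cached in the timewait sock, then
 * drop the reference taken by the lookup.
 */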
709 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
710 {
711         struct inet_timewait_sock *tw = inet_twsk(sk);
712         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
713
714         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
715                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
716                         tcptw->tw_ts_recent);
717
718         inet_twsk_put(tw);
719 }
720
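/* ACK on behalf of a pending connection request (SYN-RECV): the sequence
 * numbers and window come from the request_sock, not from a full socket.
 */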
721 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
722                                   struct request_sock *req)
723 {
724         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
725                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
726                         req->ts_recent);
727 }
728
729 /*
730  *      Send a SYN-ACK after having received an ACK.
731  *      This still operates on a request_sock only, not on a big
732  *      socket.
733  */
734 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
735                               struct dst_entry *dst)
736 {
737         const struct inet_request_sock *ireq = inet_rsk(req);
738         int err = -1;
739         struct sk_buff * skb;
740
741         /* First, grab a route. */
742         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
743                 goto out;
744
745         skb = tcp_make_synack(sk, dst, req);
746
747         if (skb) {
748                 struct tcphdr *th = skb->h.th;
749
750                 th->check = tcp_v4_check(skb->len,
751                                          ireq->loc_addr,
752                                          ireq->rmt_addr,
753                                          csum_partial((char *)th, skb->len,
754                                                       skb->csum));
755
756                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
757                                             ireq->rmt_addr,
758                                             ireq->opt);
759                 err = net_xmit_eval(err);
760         }
761
762 out:
763         dst_release(dst);
764         return err;
765 }
766
767 /*
768  *      IPv4 request_sock destructor.
769  */
770 static void tcp_v4_reqsk_destructor(struct request_sock *req)
771 {
772         kfree(inet_rsk(req)->opt);
773 }
774
775 #ifdef CONFIG_SYN_COOKIES
776 static void syn_flood_warning(struct sk_buff *skb)
777 {
778         static unsigned long warntime;
779
780         if (time_after(jiffies, (warntime + HZ * 60))) {
781                 warntime = jiffies;
782                 printk(KERN_INFO
783                        "possible SYN flooding on port %d. Sending cookies.\n",
784                        ntohs(skb->h.th->dest));
785         }
786 }
787 #endif
788
789 /*
790  * Save and compile IPv4 options into the request_sock if needed.
791  */
792 static struct ip_options *tcp_v4_save_options(struct sock *sk,
793                                               struct sk_buff *skb)
794 {
795         struct ip_options *opt = &(IPCB(skb)->opt);
796         struct ip_options *dopt = NULL;
797
798         if (opt && opt->optlen) {
799                 int opt_size = optlength(opt);
800                 dopt = kmalloc(opt_size, GFP_ATOMIC);
801                 if (dopt) {
802                         if (ip_options_echo(dopt, skb)) {
803                                 kfree(dopt);
804                                 dopt = NULL;
805                         }
806                 }
807         }
808         return dopt;
809 }
810
811 #ifdef CONFIG_TCP_MD5SIG
812 /*
813  * RFC2385 MD5 checksumming requires a mapping of
814  * IP address->MD5 Key.
815  * We need to maintain these in the sk structure.
816  */
817
818 /* Find the Key structure for an address.  */
819 static struct tcp_md5sig_key *
820                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
821 {
822         struct tcp_sock *tp = tcp_sk(sk);
823         int i;
824
825         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
826                 return NULL;
827         for (i = 0; i < tp->md5sig_info->entries4; i++) {
828                 if (tp->md5sig_info->keys4[i].addr == addr)
829                         return (struct tcp_md5sig_key *)
830                                                 &tp->md5sig_info->keys4[i];
831         }
832         return NULL;
833 }
834
835 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
836                                          struct sock *addr_sk)
837 {
838         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
839 }
840
841 EXPORT_SYMBOL(tcp_v4_md5_lookup);
842
843 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
844                                                       struct request_sock *req)
845 {
846         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
847 }
848
849 /* This can be called on a newly created socket, from other files */
850 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
851                       u8 *newkey, u8 newkeylen)
852 {
853         /* Add Key to the list */
854         struct tcp4_md5sig_key *key;
855         struct tcp_sock *tp = tcp_sk(sk);
856         struct tcp4_md5sig_key *keys;
857
858         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
859         if (key) {
860                 /* Pre-existing entry - just update that one. */
861                 kfree(key->key);
862                 key->key = newkey;
863                 key->keylen = newkeylen;
864         } else {
865                 struct tcp_md5sig_info *md5sig;
866
867                 if (!tp->md5sig_info) {
868                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
869                                                   GFP_ATOMIC);
870                         if (!tp->md5sig_info) {
871                                 kfree(newkey);
872                                 return -ENOMEM;
873                         }
874                 }
875                 if (tcp_alloc_md5sig_pool() == NULL) {
876                         kfree(newkey);
877                         return -ENOMEM;
878                 }
879                 md5sig = tp->md5sig_info;
880
881                 if (md5sig->alloced4 == md5sig->entries4) {
882                         keys = kmalloc((sizeof(*keys) *
883                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
884                         if (!keys) {
885                                 kfree(newkey);
886                                 tcp_free_md5sig_pool();
887                                 return -ENOMEM;
888                         }
889
890                         if (md5sig->entries4)
891                                 memcpy(keys, md5sig->keys4,
892                                        sizeof(*keys) * md5sig->entries4);
893
894                         /* Free old key list, and reference new one */
895                         if (md5sig->keys4)
896                                 kfree(md5sig->keys4);
897                         md5sig->keys4 = keys;
898                         md5sig->alloced4++;
899                 }
900                 md5sig->entries4++;
901                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
902                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
903                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
904         }
905         return 0;
906 }
907
908 EXPORT_SYMBOL(tcp_v4_md5_do_add);
909
910 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
911                                u8 *newkey, u8 newkeylen)
912 {
913         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
914                                  newkey, newkeylen);
915 }
916
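/* Remove the MD5 key bound to a peer address: the remaining entries are
 * packed down over the hole, and the array is freed once it becomes empty.
 */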
917 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
918 {
919         struct tcp_sock *tp = tcp_sk(sk);
920         int i;
921
922         for (i = 0; i < tp->md5sig_info->entries4; i++) {
923                 if (tp->md5sig_info->keys4[i].addr == addr) {
924                         /* Free the key */
925                         kfree(tp->md5sig_info->keys4[i].key);
926                         tp->md5sig_info->entries4--;
927
928                         if (tp->md5sig_info->entries4 == 0) {
929                                 kfree(tp->md5sig_info->keys4);
930                                 tp->md5sig_info->keys4 = NULL;
931                                 tp->md5sig_info->alloced4 = 0;
932                         } else if (tp->md5sig_info->entries4 != i) {
933                                 /* Need to do some manipulation */
934                                 memcpy(&tp->md5sig_info->keys4[i],
935                                        &tp->md5sig_info->keys4[i+1],
936                                        (tp->md5sig_info->entries4 - i) *
937                                         sizeof(struct tcp4_md5sig_key));
938                         }
939                         tcp_free_md5sig_pool();
940                         return 0;
941                 }
942         }
943         return -ENOENT;
944 }
945
946 EXPORT_SYMBOL(tcp_v4_md5_do_del);
947
948 static void tcp_v4_clear_md5_list(struct sock *sk)
949 {
950         struct tcp_sock *tp = tcp_sk(sk);
951
952         /* Free each key, then the array of keys,
953          * the crypto element, and then decrement our
954          * hold on the last resort crypto.
955          */
956         if (tp->md5sig_info->entries4) {
957                 int i;
958                 for (i = 0; i < tp->md5sig_info->entries4; i++)
959                         kfree(tp->md5sig_info->keys4[i].key);
960                 tp->md5sig_info->entries4 = 0;
961                 tcp_free_md5sig_pool();
962         }
963         if (tp->md5sig_info->keys4) {
964                 kfree(tp->md5sig_info->keys4);
965                 tp->md5sig_info->keys4 = NULL;
966                 tp->md5sig_info->alloced4  = 0;
967         }
968 }
969
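/* setsockopt(TCP_MD5SIG) handler: copy the request from userspace, then
 * either delete the key for the given peer (empty key/length) or add or
 * replace it, allocating the per-socket md5sig_info on first use.
 */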
970 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
971                                  int optlen)
972 {
973         struct tcp_md5sig cmd;
974         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
975         u8 *newkey;
976
977         if (optlen < sizeof(cmd))
978                 return -EINVAL;
979
980         if (copy_from_user(&cmd, optval, sizeof(cmd)))
981                 return -EFAULT;
982
983         if (sin->sin_family != AF_INET)
984                 return -EINVAL;
985
986         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
987                 if (!tcp_sk(sk)->md5sig_info)
988                         return -ENOENT;
989                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
990         }
991
992         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
993                 return -EINVAL;
994
995         if (!tcp_sk(sk)->md5sig_info) {
996                 struct tcp_sock *tp = tcp_sk(sk);
997                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
998
999                 if (!p)
1000                         return -ENOMEM;
1001
1002                 tp->md5sig_info = p;
1003
1004         }
1005
1006         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1007         if (!newkey)
1008                 return -ENOMEM;
1009         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1010                                  newkey, cmd.tcpm_keylen);
1011 }
1012
1013 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1014                                    __be32 saddr, __be32 daddr,
1015                                    struct tcphdr *th, int protocol,
1016                                    int tcplen)
1017 {
1018         struct scatterlist sg[4];
1019         __u16 data_len;
1020         int block = 0;
1021         __sum16 old_checksum;
1022         struct tcp_md5sig_pool *hp;
1023         struct tcp4_pseudohdr *bp;
1024         struct hash_desc *desc;
1025         int err;
1026         unsigned int nbytes = 0;
1027
1028         /*
1029          * Okay, so RFC2385 is turned on for this connection,
1030          * so we need to generate the MD5 hash for the packet now.
1031          */
1032
1033         hp = tcp_get_md5sig_pool();
1034         if (!hp)
1035                 goto clear_hash_noput;
1036
1037         bp = &hp->md5_blk.ip4;
1038         desc = &hp->md5_desc;
1039
1040         /*
1041          * 1. the TCP pseudo-header (in the order: source IP address,
1042          * destination IP address, zero-padded protocol number, and
1043          * segment length)
1044          */
1045         bp->saddr = saddr;
1046         bp->daddr = daddr;
1047         bp->pad = 0;
1048         bp->protocol = protocol;
1049         bp->len = htons(tcplen);
1050         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1051         nbytes += sizeof(*bp);
1052
1053         /* 2. the TCP header, excluding options, and assuming a
1054          * checksum of zero.
1055          */
1056         old_checksum = th->check;
1057         th->check = 0;
1058         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1059         nbytes += sizeof(struct tcphdr);
1060
1061         /* 3. the TCP segment data (if any) */
1062         data_len = tcplen - (th->doff << 2);
1063         if (data_len > 0) {
1064                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1065                 sg_set_buf(&sg[block++], data, data_len);
1066                 nbytes += data_len;
1067         }
1068
1069         /* 4. an independently-specified key or password, known to both
1070          * TCPs and presumably connection-specific
1071          */
1072         sg_set_buf(&sg[block++], key->key, key->keylen);
1073         nbytes += key->keylen;
1074
1075         /* Now store the Hash into the packet */
1076         err = crypto_hash_init(desc);
1077         if (err)
1078                 goto clear_hash;
1079         err = crypto_hash_update(desc, sg, nbytes);
1080         if (err)
1081                 goto clear_hash;
1082         err = crypto_hash_final(desc, md5_hash);
1083         if (err)
1084                 goto clear_hash;
1085
1086         /* Reset header, and free up the crypto */
1087         tcp_put_md5sig_pool();
1088         th->check = old_checksum;
1089
1090 out:
1091         return 0;
1092 clear_hash:
1093         tcp_put_md5sig_pool();
1094 clear_hash_noput:
1095         memset(md5_hash, 0, 16);
1096         goto out;
1097 }
1098
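/* Transmit-path wrapper: take the pseudo-header addresses from the socket
 * when one is available, otherwise from the cached route, and hand off to
 * tcp_v4_do_calc_md5_hash() above.
 */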
1099 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1100                          struct sock *sk,
1101                          struct dst_entry *dst,
1102                          struct request_sock *req,
1103                          struct tcphdr *th, int protocol,
1104                          int tcplen)
1105 {
1106         __be32 saddr, daddr;
1107
1108         if (sk) {
1109                 saddr = inet_sk(sk)->saddr;
1110                 daddr = inet_sk(sk)->daddr;
1111         } else {
1112                 struct rtable *rt = (struct rtable *)dst;
1113                 BUG_ON(!rt);
1114                 saddr = rt->rt_src;
1115                 daddr = rt->rt_dst;
1116         }
1117         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1118                                        saddr, daddr,
1119                                        th, protocol, tcplen);
1120 }
1121
1122 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1123
1124 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1125 {
1126         /*
1127          * This gets called for each TCP segment that arrives
1128          * so we want to be efficient.
1129          * We have 3 drop cases:
1130          * o No MD5 hash and one expected.
1131          * o MD5 hash and we're not expecting one.
1132          * o MD5 hash and it's wrong.
1133          */
1134         __u8 *hash_location = NULL;
1135         struct tcp_md5sig_key *hash_expected;
1136         struct iphdr *iph = skb->nh.iph;
1137         struct tcphdr *th = skb->h.th;
1138         int length = (th->doff << 2) - sizeof(struct tcphdr);
1139         int genhash;
1140         unsigned char *ptr;
1141         unsigned char newhash[16];
1142
1143         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1144
1145         /*
1146          * If the TCP option length is less than the TCP_MD5SIG
1147          * option length, then we can shortcut
1148          */
1149         if (length < TCPOLEN_MD5SIG) {
1150                 if (hash_expected)
1151                         return 1;
1152                 else
1153                         return 0;
1154         }
1155
1156         /* Okay, we can't shortcut - we have to grub through the options */
1157         ptr = (unsigned char *)(th + 1);
1158         while (length > 0) {
1159                 int opcode = *ptr++;
1160                 int opsize;
1161
1162                 switch (opcode) {
1163                 case TCPOPT_EOL:
1164                         goto done_opts;
1165                 case TCPOPT_NOP:
1166                         length--;
1167                         continue;
1168                 default:
1169                         opsize = *ptr++;
1170                         if (opsize < 2)
1171                                 goto done_opts;
1172                         if (opsize > length)
1173                                 goto done_opts;
1174
1175                         if (opcode == TCPOPT_MD5SIG) {
1176                                 hash_location = ptr;
1177                                 goto done_opts;
1178                         }
1179                 }
1180                 ptr += opsize-2;
1181                 length -= opsize;
1182         }
1183 done_opts:
1184         /* We've parsed the options - do we have a hash? */
1185         if (!hash_expected && !hash_location)
1186                 return 0;
1187
1188         if (hash_expected && !hash_location) {
1189                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1190                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1191                                NIPQUAD(iph->saddr), ntohs(th->source),
1192                                NIPQUAD(iph->daddr), ntohs(th->dest));
1193                 return 1;
1194         }
1195
1196         if (!hash_expected && hash_location) {
1197                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1198                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1199                                NIPQUAD(iph->saddr), ntohs(th->source),
1200                                NIPQUAD(iph->daddr), ntohs(th->dest));
1201                 return 1;
1202         }
1203
1204         /* Okay, so this is hash_expected and hash_location -
1205          * so we need to calculate the checksum.
1206          */
1207         genhash = tcp_v4_do_calc_md5_hash(newhash,
1208                                           hash_expected,
1209                                           iph->saddr, iph->daddr,
1210                                           th, sk->sk_protocol,
1211                                           skb->len);
1212
1213         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1214                 if (net_ratelimit()) {
1215                         printk(KERN_INFO "MD5 Hash failed for "
1216                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1217                                NIPQUAD(iph->saddr), ntohs(th->source),
1218                                NIPQUAD(iph->daddr), ntohs(th->dest),
1219                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1220                 }
1221                 return 1;
1222         }
1223         return 0;
1224 }
1225
1226 #endif
1227
1228 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1229         .family         =       PF_INET,
1230         .obj_size       =       sizeof(struct tcp_request_sock),
1231         .rtx_syn_ack    =       tcp_v4_send_synack,
1232         .send_ack       =       tcp_v4_reqsk_send_ack,
1233         .destructor     =       tcp_v4_reqsk_destructor,
1234         .send_reset     =       tcp_v4_send_reset,
1235 };
1236
1237 #ifdef CONFIG_TCP_MD5SIG
1238 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1239         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1240 };
1241 #endif
1242
1243 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1244         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1245         .twsk_unique    = tcp_twsk_unique,
1246         .twsk_destructor= tcp_twsk_destructor,
1247 };
1248
1249 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1250 {
1251         struct inet_request_sock *ireq;
1252         struct tcp_options_received tmp_opt;
1253         struct request_sock *req;
1254         __be32 saddr = skb->nh.iph->saddr;
1255         __be32 daddr = skb->nh.iph->daddr;
1256         __u32 isn = TCP_SKB_CB(skb)->when;
1257         struct dst_entry *dst = NULL;
1258 #ifdef CONFIG_SYN_COOKIES
1259         int want_cookie = 0;
1260 #else
1261 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1262 #endif
1263
1264         /* Never answer SYNs sent to broadcast or multicast */
1265         if (((struct rtable *)skb->dst)->rt_flags &
1266             (RTCF_BROADCAST | RTCF_MULTICAST))
1267                 goto drop;
1268
1269         /* TW buckets are converted to open requests without
1270          * limitation; they conserve resources and the peer is
1271          * evidently a real one.
1272          */
1273         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1274 #ifdef CONFIG_SYN_COOKIES
1275                 if (sysctl_tcp_syncookies) {
1276                         want_cookie = 1;
1277                 } else
1278 #endif
1279                 goto drop;
1280         }
1281
1282         /* Accept backlog is full. If we have already queued enough
1283          * warm entries in the syn queue, drop the request. It is better than
1284          * clogging the syn queue with openreqs with exponentially increasing
1285          * timeouts.
1286          */
1287         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1288                 goto drop;
1289
1290         req = reqsk_alloc(&tcp_request_sock_ops);
1291         if (!req)
1292                 goto drop;
1293
1294 #ifdef CONFIG_TCP_MD5SIG
1295         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1296 #endif
1297
1298         tcp_clear_options(&tmp_opt);
1299         tmp_opt.mss_clamp = 536;
1300         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1301
1302         tcp_parse_options(skb, &tmp_opt, 0);
1303
1304         if (want_cookie) {
1305                 tcp_clear_options(&tmp_opt);
1306                 tmp_opt.saw_tstamp = 0;
1307         }
1308
1309         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1310                 /* Some OSes (unknown ones, but I see them on a web server, which
1311                  * contains information interesting only for Windows
1312                  * users) do not send their timestamp in the SYN. It is an easy case.
1313                  * We simply do not advertise TS support.
1314                  */
1315                 tmp_opt.saw_tstamp = 0;
1316                 tmp_opt.tstamp_ok  = 0;
1317         }
1318         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1319
1320         tcp_openreq_init(req, &tmp_opt, skb);
1321
1322         if (security_inet_conn_request(sk, skb, req))
1323                 goto drop_and_free;
1324
1325         ireq = inet_rsk(req);
1326         ireq->loc_addr = daddr;
1327         ireq->rmt_addr = saddr;
1328         ireq->opt = tcp_v4_save_options(sk, skb);
1329         if (!want_cookie)
1330                 TCP_ECN_create_request(req, skb->h.th);
1331
1332         if (want_cookie) {
1333 #ifdef CONFIG_SYN_COOKIES
1334                 syn_flood_warning(skb);
1335 #endif
1336                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1337         } else if (!isn) {
1338                 struct inet_peer *peer = NULL;
1339
1340                 /* VJ's idea. We save last timestamp seen
1341                  * from the destination in peer table, when entering
1342                  * state TIME-WAIT, and check against it before
1343                  * accepting new connection request.
1344                  *
1345                  * If "isn" is not zero, this request hit alive
1346                  * timewait bucket, so that all the necessary checks
1347                  * are made in the function processing timewait state.
1348                  */
1349                 if (tmp_opt.saw_tstamp &&
1350                     tcp_death_row.sysctl_tw_recycle &&
1351                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1352                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1353                     peer->v4daddr == saddr) {
1354                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1355                             (s32)(peer->tcp_ts - req->ts_recent) >
1356                                                         TCP_PAWS_WINDOW) {
1357                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1358                                 dst_release(dst);
1359                                 goto drop_and_free;
1360                         }
1361                 }
1362                 /* Kill the following clause, if you dislike this way. */
1363                 else if (!sysctl_tcp_syncookies &&
1364                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1365                           (sysctl_max_syn_backlog >> 2)) &&
1366                          (!peer || !peer->tcp_ts_stamp) &&
1367                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1368                         /* Without syncookies, the last quarter of the
1369                          * backlog is filled with destinations
1370                          * proven to be alive.
1371                          * It means that we continue to communicate
1372                          * with destinations already remembered
1373                          * at the moment of the synflood.
1374                          */
1375                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1376                                        "request from %u.%u.%u.%u/%u\n",
1377                                        NIPQUAD(saddr),
1378                                        ntohs(skb->h.th->source));
1379                         dst_release(dst);
1380                         goto drop_and_free;
1381                 }
1382
1383                 isn = tcp_v4_init_sequence(skb);
1384         }
1385         tcp_rsk(req)->snt_isn = isn;
1386
1387         if (tcp_v4_send_synack(sk, req, dst))
1388                 goto drop_and_free;
1389
1390         if (want_cookie) {
1391                 reqsk_free(req);
1392         } else {
1393                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1394         }
1395         return 0;
1396
1397 drop_and_free:
1398         reqsk_free(req);
1399 drop:
1400         return 0;
1401 }
1402
1403
1404 /*
1405  * The three way handshake has completed - we got a valid synack -
1406  * now create the new socket.
1407  */
1408 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1409                                   struct request_sock *req,
1410                                   struct dst_entry *dst)
1411 {
1412         struct inet_request_sock *ireq;
1413         struct inet_sock *newinet;
1414         struct tcp_sock *newtp;
1415         struct sock *newsk;
1416 #ifdef CONFIG_TCP_MD5SIG
1417         struct tcp_md5sig_key *key;
1418 #endif
1419
1420         if (sk_acceptq_is_full(sk))
1421                 goto exit_overflow;
1422
1423         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1424                 goto exit;
1425
1426         newsk = tcp_create_openreq_child(sk, req, skb);
1427         if (!newsk)
1428                 goto exit;
1429
1430         newsk->sk_gso_type = SKB_GSO_TCPV4;
1431         sk_setup_caps(newsk, dst);
1432
1433         newtp                 = tcp_sk(newsk);
1434         newinet               = inet_sk(newsk);
1435         ireq                  = inet_rsk(req);
1436         newinet->daddr        = ireq->rmt_addr;
1437         newinet->rcv_saddr    = ireq->loc_addr;
1438         newinet->saddr        = ireq->loc_addr;
1439         newinet->opt          = ireq->opt;
1440         ireq->opt             = NULL;
1441         newinet->mc_index     = inet_iif(skb);
1442         newinet->mc_ttl       = skb->nh.iph->ttl;
1443         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444         if (newinet->opt)
1445                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1446         newinet->id = newtp->write_seq ^ jiffies;
1447
1448         tcp_mtup_init(newsk);
1449         tcp_sync_mss(newsk, dst_mtu(dst));
1450         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451         tcp_initialize_rcv_mss(newsk);
1452
1453 #ifdef CONFIG_TCP_MD5SIG
1454         /* Copy over the MD5 key from the original socket */
1455         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1456                 /*
1457                  * We're using one, so create a matching key
1458                  * on the newsk structure. If we fail to get
1459                  * memory, then we end up not copying the key
1460                  * across. Shucks.
1461                  */
1462                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463                 if (newkey != NULL)
1464                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1465                                           newkey, key->keylen);
1466         }
1467 #endif
1468
1469         __inet_hash(&tcp_hashinfo, newsk, 0);
1470         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1471
1472         return newsk;
1473
1474 exit_overflow:
1475         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1476 exit:
1477         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1478         dst_release(dst);
1479         return NULL;
1480 }
1481
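/* Handle a segment arriving on a listening socket.  First look for a
 * pending request sock (an embryonic connection waiting for its final
 * ACK); if one matches, tcp_check_req() validates the segment and may
 * create the full child socket.  Otherwise check whether an already
 * established socket owns the 4-tuple, and finally, if syncookies are
 * compiled in, treat a bare ACK as a possible cookie reply.
 */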
1482 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1483 {
1484         struct tcphdr *th = skb->h.th;
1485         struct iphdr *iph = skb->nh.iph;
1486         struct sock *nsk;
1487         struct request_sock **prev;
1488         /* Find possible connection requests. */
1489         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1490                                                        iph->saddr, iph->daddr);
1491         if (req)
1492                 return tcp_check_req(sk, skb, req, prev);
1493
1494         nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1495                                       th->source, skb->nh.iph->daddr,
1496                                       th->dest, inet_iif(skb));
1497
1498         if (nsk) {
1499                 if (nsk->sk_state != TCP_TIME_WAIT) {
1500                         bh_lock_sock(nsk);
1501                         return nsk;
1502                 }
1503                 inet_twsk_put(inet_twsk(nsk));
1504                 return NULL;
1505         }
1506
1507 #ifdef CONFIG_SYN_COOKIES
1508         if (!th->rst && !th->syn && th->ack)
1509                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1510 #endif
1511         return sk;
1512 }
1513
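/* Initialize/verify the checksum of an incoming segment.  If the device
 * delivered a full checksum (CHECKSUM_COMPLETE), fold in the pseudo
 * header and mark the skb CHECKSUM_UNNECESSARY on success.  Otherwise
 * seed skb->csum with the pseudo header sum; short packets (<= 76 bytes)
 * are verified in software right away, larger ones are checked later
 * when the data is actually touched.
 */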
1514 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1515 {
1516         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1517                 if (!tcp_v4_check(skb->len, skb->nh.iph->saddr,
1518                                   skb->nh.iph->daddr, skb->csum)) {
1519                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1520                         return 0;
1521                 }
1522         }
1523
1524         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1525                                        skb->len, IPPROTO_TCP, 0);
1526
1527         if (skb->len <= 76) {
1528                 return __skb_checksum_complete(skb);
1529         }
1530         return 0;
1531 }
1532
1533
1534 /* The socket must have its spinlock held when we get
1535  * here.
1536  *
1537  * We have a potential double-lock case here, so even when
1538  * doing backlog processing we use the BH locking scheme.
1539  * This is because we cannot sleep with the original spinlock
1540  * held.
1541  */
1542 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1543 {
1544         struct sock *rsk;
1545 #ifdef CONFIG_TCP_MD5SIG
1546         /*
1547          * We really want to reject the packet as early as possible
1548          * if:
1549          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1550          *  o There is an MD5 option and we're not expecting one
1551          */
1552         if (tcp_v4_inbound_md5_hash(sk, skb))
1553                 goto discard;
1554 #endif
1555
1556         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1557                 TCP_CHECK_TIMER(sk);
1558                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) {
1559                         rsk = sk;
1560                         goto reset;
1561                 }
1562                 TCP_CHECK_TIMER(sk);
1563                 return 0;
1564         }
1565
1566         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1567                 goto csum_err;
1568
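        /* Connection establishment: tcp_v4_hnd_req() returns either a
         * newly created child socket (handshake completing), the
         * listening socket itself (let the state machine below handle a
         * fresh SYN), or NULL if the segment should be dropped.
         */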
1569         if (sk->sk_state == TCP_LISTEN) {
1570                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1571                 if (!nsk)
1572                         goto discard;
1573
1574                 if (nsk != sk) {
1575                         if (tcp_child_process(sk, nsk, skb)) {
1576                                 rsk = nsk;
1577                                 goto reset;
1578                         }
1579                         return 0;
1580                 }
1581         }
1582
1583         TCP_CHECK_TIMER(sk);
1584         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) {
1585                 rsk = sk;
1586                 goto reset;
1587         }
1588         TCP_CHECK_TIMER(sk);
1589         return 0;
1590
1591 reset:
1592         tcp_v4_send_reset(rsk, skb);
1593 discard:
1594         kfree_skb(skb);
1595         /* Be careful here. If this function gets more complicated and
1596          * gcc suffers from register pressure on the x86, sk (in %ebx)
1597          * might be destroyed here. This current version compiles correctly,
1598          * but you have been warned.
1599          */
1600         return 0;
1601
1602 csum_err:
1603         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1604         goto discard;
1605 }
1606
1607 /*
1608  *      From tcp_input.c
1609  */
1610
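/* Main receive routine, called from the IP layer for every incoming TCP
 * segment.  It validates the header, fills in the TCP control block,
 * looks up the owning socket and then either processes the segment in
 * softirq context, queues it on the prequeue, or appends it to the
 * socket backlog when the socket is owned by a user context.
 */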
1611 int tcp_v4_rcv(struct sk_buff *skb)
1612 {
1613         struct tcphdr *th;
1614         struct sock *sk;
1615         int ret;
1616
1617         if (skb->pkt_type != PACKET_HOST)
1618                 goto discard_it;
1619
1620         /* Count it even if it's bad */
1621         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1622
1623         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1624                 goto discard_it;
1625
1626         th = skb->h.th;
1627
1628         if (th->doff < sizeof(struct tcphdr) / 4)
1629                 goto bad_packet;
1630         if (!pskb_may_pull(skb, th->doff * 4))
1631                 goto discard_it;
1632
1633         /* An explanation is required here, I think.
1634          * Packet length and doff are validated by header prediction,
1635          * provided the case of th->doff == 0 has been eliminated above.
1636          * So, we defer the checks. */
1637         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1638              tcp_v4_checksum_init(skb)))
1639                 goto bad_packet;
1640
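        /* Fill the TCP control block.  SYN and FIN each consume one unit
         * of sequence space, so end_seq = seq + syn + fin + payload len.
         */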
1641         th = skb->h.th;
1642         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1643         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1644                                     skb->len - th->doff * 4);
1645         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1646         TCP_SKB_CB(skb)->when    = 0;
1647         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1648         TCP_SKB_CB(skb)->sacked  = 0;
1649
1650         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1651                            skb->nh.iph->daddr, th->dest,
1652                            inet_iif(skb));
1653
1654         if (!sk)
1655                 goto no_tcp_socket;
1656
1657 process:
1658         if (sk->sk_state == TCP_TIME_WAIT)
1659                 goto do_time_wait;
1660
1661         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1662                 goto discard_and_relse;
1663         nf_reset(skb);
1664
1665         if (sk_filter(sk, skb))
1666                 goto discard_and_relse;
1667
1668         skb->dev = NULL;
1669
1670         bh_lock_sock_nested(sk);
1671         ret = 0;
1672         if (!sock_owned_by_user(sk)) {
1673 #ifdef CONFIG_NET_DMA
1674                 struct tcp_sock *tp = tcp_sk(sk);
1675                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1676                         tp->ucopy.dma_chan = get_softnet_dma();
1677                 if (tp->ucopy.dma_chan)
1678                         ret = tcp_v4_do_rcv(sk, skb);
1679                 else
1680 #endif
1681                 {
1682                         if (!tcp_prequeue(sk, skb))
1683                                 ret = tcp_v4_do_rcv(sk, skb);
1684                 }
1685         } else
1686                 sk_add_backlog(sk, skb);
1687         bh_unlock_sock(sk);
1688
1689         sock_put(sk);
1690
1691         return ret;
1692
1693 no_tcp_socket:
1694         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1695                 goto discard_it;
1696
1697         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1698 bad_packet:
1699                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1700         } else {
1701                 tcp_v4_send_reset(NULL, skb);
1702         }
1703
1704 discard_it:
1705         /* Discard frame. */
1706         kfree_skb(skb);
1707         return 0;
1708
1709 discard_and_relse:
1710         sock_put(sk);
1711         goto discard_it;
1712
1713 do_time_wait:
1714         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1715                 inet_twsk_put(inet_twsk(sk));
1716                 goto discard_it;
1717         }
1718
1719         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1720                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1721                 inet_twsk_put(inet_twsk(sk));
1722                 goto discard_it;
1723         }
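        /* tcp_timewait_state_process() tells us what to do with a segment
         * that hit a TIME_WAIT socket: TCP_TW_SYN means a new SYN may
         * legitimately reopen the connection, so look for a listener and
         * reprocess it there; TCP_TW_ACK re-sends the final ACK;
         * TCP_TW_RST answers with a reset; TCP_TW_SUCCESS just drops it.
         */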
1724         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1725         case TCP_TW_SYN: {
1726                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1727                                                         skb->nh.iph->daddr,
1728                                                         th->dest,
1729                                                         inet_iif(skb));
1730                 if (sk2) {
1731                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1732                         inet_twsk_put(inet_twsk(sk));
1733                         sk = sk2;
1734                         goto process;
1735                 }
1736                 /* Fall through to ACK */
1737         }
1738         case TCP_TW_ACK:
1739                 tcp_v4_timewait_ack(sk, skb);
1740                 break;
1741         case TCP_TW_RST:
1742                 goto no_tcp_socket;
1743         case TCP_TW_SUCCESS:;
1744         }
1745         goto discard_it;
1746 }
1747
1748 /* VJ's idea. Save the last timestamp seen from this destination
1749  * and hold it at least for the normal timewait interval, so it can be used
1750  * for duplicate segment detection in subsequent connections before they
1751  * enter the synchronized state.
1752  */
1753
1754 int tcp_v4_remember_stamp(struct sock *sk)
1755 {
1756         struct inet_sock *inet = inet_sk(sk);
1757         struct tcp_sock *tp = tcp_sk(sk);
1758         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1759         struct inet_peer *peer = NULL;
1760         int release_it = 0;
1761
1762         if (!rt || rt->rt_dst != inet->daddr) {
1763                 peer = inet_getpeer(inet->daddr, 1);
1764                 release_it = 1;
1765         } else {
1766                 if (!rt->peer)
1767                         rt_bind_peer(rt, 1);
1768                 peer = rt->peer;
1769         }
1770
1771         if (peer) {
1772                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1773                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1774                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1775                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1776                         peer->tcp_ts = tp->rx_opt.ts_recent;
1777                 }
1778                 if (release_it)
1779                         inet_putpeer(peer);
1780                 return 1;
1781         }
1782
1783         return 0;
1784 }
1785
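/* Same idea as tcp_v4_remember_stamp(), but for a connection that has
 * already been demoted to a timewait sock: the timestamp state is taken
 * from the inet_timewait_sock instead of the full tcp_sock.
 */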
1786 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1787 {
1788         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1789
1790         if (peer) {
1791                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1792
1793                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1794                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1795                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1796                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1797                         peer->tcp_ts       = tcptw->tw_ts_recent;
1798                 }
1799                 inet_putpeer(peer);
1800                 return 1;
1801         }
1802
1803         return 0;
1804 }
1805
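/* Address-family specific operations, reached through icsk->icsk_af_ops.
 * They let the protocol-independent TCP code build and transmit IPv4
 * headers and handle incoming connection requests.
 */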
1806 struct inet_connection_sock_af_ops ipv4_specific = {
1807         .queue_xmit        = ip_queue_xmit,
1808         .send_check        = tcp_v4_send_check,
1809         .rebuild_header    = inet_sk_rebuild_header,
1810         .conn_request      = tcp_v4_conn_request,
1811         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1812         .remember_stamp    = tcp_v4_remember_stamp,
1813         .net_header_len    = sizeof(struct iphdr),
1814         .setsockopt        = ip_setsockopt,
1815         .getsockopt        = ip_getsockopt,
1816         .addr2sockaddr     = inet_csk_addr2sockaddr,
1817         .sockaddr_len      = sizeof(struct sockaddr_in),
1818 #ifdef CONFIG_COMPAT
1819         .compat_setsockopt = compat_ip_setsockopt,
1820         .compat_getsockopt = compat_ip_getsockopt,
1821 #endif
1822 };
1823
1824 #ifdef CONFIG_TCP_MD5SIG
1825 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1826         .md5_lookup             = tcp_v4_md5_lookup,
1827         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1828         .md5_add                = tcp_v4_md5_add_func,
1829         .md5_parse              = tcp_v4_parse_md5_keys,
1830 };
1831 #endif
1832
1833 /* NOTE: A lot of fields are set to zero explicitly by the call to
1834  *       sk_alloc(), so they need not be initialized here.
1835  */
1836 static int tcp_v4_init_sock(struct sock *sk)
1837 {
1838         struct inet_connection_sock *icsk = inet_csk(sk);
1839         struct tcp_sock *tp = tcp_sk(sk);
1840
1841         skb_queue_head_init(&tp->out_of_order_queue);
1842         tcp_init_xmit_timers(sk);
1843         tcp_prequeue_init(tp);
1844
1845         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1846         tp->mdev = TCP_TIMEOUT_INIT;
1847
1848         /* So many TCP implementations out there (incorrectly) count the
1849          * initial SYN frame in their delayed-ACK and congestion control
1850          * algorithms that we must have the following bandaid to talk
1851          * efficiently to them.  -DaveM
1852          */
1853         tp->snd_cwnd = 2;
1854
1855         /* See draft-stevens-tcpca-spec-01 for discussion of the
1856          * initialization of these values.
1857          */
1858         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1859         tp->snd_cwnd_clamp = ~0;
1860         tp->mss_cache = 536;
1861
1862         tp->reordering = sysctl_tcp_reordering;
1863         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1864
1865         sk->sk_state = TCP_CLOSE;
1866
1867         sk->sk_write_space = sk_stream_write_space;
1868         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1869
1870         icsk->icsk_af_ops = &ipv4_specific;
1871         icsk->icsk_sync_mss = tcp_sync_mss;
1872 #ifdef CONFIG_TCP_MD5SIG
1873         tp->af_specific = &tcp_sock_ipv4_specific;
1874 #endif
1875
1876         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1877         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1878
1879         atomic_inc(&tcp_sockets_allocated);
1880
1881         return 0;
1882 }
1883
1884 int tcp_v4_destroy_sock(struct sock *sk)
1885 {
1886         struct tcp_sock *tp = tcp_sk(sk);
1887
1888         tcp_clear_xmit_timers(sk);
1889
1890         tcp_cleanup_congestion_control(sk);
1891
1892         /* Clean up the write buffer. */
1893         sk_stream_writequeue_purge(sk);
1894
1895         /* Cleans up our, hopefully empty, out_of_order_queue. */
1896         __skb_queue_purge(&tp->out_of_order_queue);
1897
1898 #ifdef CONFIG_TCP_MD5SIG
1899         /* Clean up the MD5 key list, if any */
1900         if (tp->md5sig_info) {
1901                 tcp_v4_clear_md5_list(sk);
1902                 kfree(tp->md5sig_info);
1903                 tp->md5sig_info = NULL;
1904         }
1905 #endif
1906
1907 #ifdef CONFIG_NET_DMA
1908         /* Cleans up our sk_async_wait_queue */
1909         __skb_queue_purge(&sk->sk_async_wait_queue);
1910 #endif
1911
1912         /* Clean up the prequeue; it should already be empty. */
1913         __skb_queue_purge(&tp->ucopy.prequeue);
1914
1915         /* Clean up a referenced TCP bind bucket. */
1916         if (inet_csk(sk)->icsk_bind_hash)
1917                 inet_put_port(&tcp_hashinfo, sk);
1918
1919         /*
1920          * If sendmsg cached page exists, toss it.
1921          */
1922         if (sk->sk_sndmsg_page) {
1923                 __free_page(sk->sk_sndmsg_page);
1924                 sk->sk_sndmsg_page = NULL;
1925         }
1926
1927         atomic_dec(&tcp_sockets_allocated);
1928
1929         return 0;
1930 }
1931
1932 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1933
1934 #ifdef CONFIG_PROC_FS
1935 /* Proc filesystem TCP sock list dumping. */
1936
1937 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1938 {
1939         return hlist_empty(head) ? NULL :
1940                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1941 }
1942
1943 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1944 {
1945         return tw->tw_node.next ?
1946                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1947 }
1948
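/* Advance the /proc iterator over the listening hash table.  While in
 * TCP_SEQ_STATE_OPENREQ we are walking one listener's SYN table with its
 * syn_wait_lock read-held; otherwise we step through listening sockets
 * of the requested family, descending into a socket's open request
 * queue whenever it is non-empty.
 */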
1949 static void *listening_get_next(struct seq_file *seq, void *cur)
1950 {
1951         struct inet_connection_sock *icsk;
1952         struct hlist_node *node;
1953         struct sock *sk = cur;
1954         struct tcp_iter_state* st = seq->private;
1955
1956         if (!sk) {
1957                 st->bucket = 0;
1958                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1959                 goto get_sk;
1960         }
1961
1962         ++st->num;
1963
1964         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1965                 struct request_sock *req = cur;
1966
1967                 icsk = inet_csk(st->syn_wait_sk);
1968                 req = req->dl_next;
1969                 while (1) {
1970                         while (req) {
1971                                 if (req->rsk_ops->family == st->family) {
1972                                         cur = req;
1973                                         goto out;
1974                                 }
1975                                 req = req->dl_next;
1976                         }
1977                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1978                                 break;
1979 get_req:
1980                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1981                 }
1982                 sk        = sk_next(st->syn_wait_sk);
1983                 st->state = TCP_SEQ_STATE_LISTENING;
1984                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1985         } else {
1986                 icsk = inet_csk(sk);
1987                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1988                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1989                         goto start_req;
1990                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1991                 sk = sk_next(sk);
1992         }
1993 get_sk:
1994         sk_for_each_from(sk, node) {
1995                 if (sk->sk_family == st->family) {
1996                         cur = sk;
1997                         goto out;
1998                 }
1999                 icsk = inet_csk(sk);
2000                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2001                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2002 start_req:
2003                         st->uid         = sock_i_uid(sk);
2004                         st->syn_wait_sk = sk;
2005                         st->state       = TCP_SEQ_STATE_OPENREQ;
2006                         st->sbucket     = 0;
2007                         goto get_req;
2008                 }
2009                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2010         }
2011         if (++st->bucket < INET_LHTABLE_SIZE) {
2012                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2013                 goto get_sk;
2014         }
2015         cur = NULL;
2016 out:
2017         return cur;
2018 }
2019
2020 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2021 {
2022         void *rc = listening_get_next(seq, NULL);
2023
2024         while (rc && *pos) {
2025                 rc = listening_get_next(seq, rc);
2026                 --*pos;
2027         }
2028         return rc;
2029 }
2030
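/* Find the first socket of the requested family in the established hash.
 * Each bucket is scanned under its read lock, first the chain of
 * established sockets and then the twchain of TIME_WAIT sockets; the
 * lock of the bucket that produced a result stays held and is dropped
 * later by established_get_next() or tcp_seq_stop().
 */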
2031 static void *established_get_first(struct seq_file *seq)
2032 {
2033         struct tcp_iter_state* st = seq->private;
2034         void *rc = NULL;
2035
2036         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2037                 struct sock *sk;
2038                 struct hlist_node *node;
2039                 struct inet_timewait_sock *tw;
2040
2041                 /* We can reschedule _before_ having picked the target: */
2042                 cond_resched_softirq();
2043
2044                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2045                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2046                         if (sk->sk_family != st->family) {
2047                                 continue;
2048                         }
2049                         rc = sk;
2050                         goto out;
2051                 }
2052                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2053                 inet_twsk_for_each(tw, node,
2054                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2055                         if (tw->tw_family != st->family) {
2056                                 continue;
2057                         }
2058                         rc = tw;
2059                         goto out;
2060                 }
2061                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2062                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2063         }
2064 out:
2065         return rc;
2066 }
2067
2068 static void *established_get_next(struct seq_file *seq, void *cur)
2069 {
2070         struct sock *sk = cur;
2071         struct inet_timewait_sock *tw;
2072         struct hlist_node *node;
2073         struct tcp_iter_state* st = seq->private;
2074
2075         ++st->num;
2076
2077         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2078                 tw = cur;
2079                 tw = tw_next(tw);
2080 get_tw:
2081                 while (tw && tw->tw_family != st->family) {
2082                         tw = tw_next(tw);
2083                 }
2084                 if (tw) {
2085                         cur = tw;
2086                         goto out;
2087                 }
2088                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2089                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2090
2091                 /* We can reschedule between buckets: */
2092                 cond_resched_softirq();
2093
2094                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2095                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2096                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2097                 } else {
2098                         cur = NULL;
2099                         goto out;
2100                 }
2101         } else
2102                 sk = sk_next(sk);
2103
2104         sk_for_each_from(sk, node) {
2105                 if (sk->sk_family == st->family)
2106                         goto found;
2107         }
2108
2109         st->state = TCP_SEQ_STATE_TIME_WAIT;
2110         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2111         goto get_tw;
2112 found:
2113         cur = sk;
2114 out:
2115         return cur;
2116 }
2117
2118 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2119 {
2120         void *rc = established_get_first(seq);
2121
2122         while (rc && pos) {
2123                 rc = established_get_next(seq, rc);
2124                 --pos;
2125         }
2126         return rc;
2127 }
2128
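/* Position the iterator at logical offset pos.  Listening sockets are
 * counted first, under the listening hash lock; if the offset lies
 * beyond them, that lock is dropped, bottom halves are disabled and the
 * walk continues through the established/TIME_WAIT tables.  The lock
 * state reached here is recorded in st->state and undone in
 * tcp_seq_stop().
 */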
2129 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2130 {
2131         void *rc;
2132         struct tcp_iter_state* st = seq->private;
2133
2134         inet_listen_lock(&tcp_hashinfo);
2135         st->state = TCP_SEQ_STATE_LISTENING;
2136         rc        = listening_get_idx(seq, &pos);
2137
2138         if (!rc) {
2139                 inet_listen_unlock(&tcp_hashinfo);
2140                 local_bh_disable();
2141                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2142                 rc        = established_get_idx(seq, pos);
2143         }
2144
2145         return rc;
2146 }
2147
2148 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2149 {
2150         struct tcp_iter_state* st = seq->private;
2151         st->state = TCP_SEQ_STATE_LISTENING;
2152         st->num = 0;
2153         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154 }
2155
2156 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157 {
2158         void *rc = NULL;
2159         struct tcp_iter_state* st;
2160
2161         if (v == SEQ_START_TOKEN) {
2162                 rc = tcp_get_idx(seq, 0);
2163                 goto out;
2164         }
2165         st = seq->private;
2166
2167         switch (st->state) {
2168         case TCP_SEQ_STATE_OPENREQ:
2169         case TCP_SEQ_STATE_LISTENING:
2170                 rc = listening_get_next(seq, v);
2171                 if (!rc) {
2172                         inet_listen_unlock(&tcp_hashinfo);
2173                         local_bh_disable();
2174                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2175                         rc        = established_get_first(seq);
2176                 }
2177                 break;
2178         case TCP_SEQ_STATE_ESTABLISHED:
2179         case TCP_SEQ_STATE_TIME_WAIT:
2180                 rc = established_get_next(seq, v);
2181                 break;
2182         }
2183 out:
2184         ++*pos;
2185         return rc;
2186 }
2187
2188 static void tcp_seq_stop(struct seq_file *seq, void *v)
2189 {
2190         struct tcp_iter_state* st = seq->private;
2191
2192         switch (st->state) {
2193         case TCP_SEQ_STATE_OPENREQ:
2194                 if (v) {
2195                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2196                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2197                 }
2198         case TCP_SEQ_STATE_LISTENING:
2199                 if (v != SEQ_START_TOKEN)
2200                         inet_listen_unlock(&tcp_hashinfo);
2201                 break;
2202         case TCP_SEQ_STATE_TIME_WAIT:
2203         case TCP_SEQ_STATE_ESTABLISHED:
2204                 if (v)
2205                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2206                 local_bh_enable();
2207                 break;
2208         }
2209 }
2210
2211 static int tcp_seq_open(struct inode *inode, struct file *file)
2212 {
2213         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2214         struct seq_file *seq;
2215         struct tcp_iter_state *s;
2216         int rc;
2217
2218         if (unlikely(afinfo == NULL))
2219                 return -EINVAL;
2220
2221         s = kzalloc(sizeof(*s), GFP_KERNEL);
2222         if (!s)
2223                 return -ENOMEM;
2224         s->family               = afinfo->family;
2225         s->seq_ops.start        = tcp_seq_start;
2226         s->seq_ops.next         = tcp_seq_next;
2227         s->seq_ops.show         = afinfo->seq_show;
2228         s->seq_ops.stop         = tcp_seq_stop;
2229
2230         rc = seq_open(file, &s->seq_ops);
2231         if (rc)
2232                 goto out_kfree;
2233         seq          = file->private_data;
2234         seq->private = s;
2235 out:
2236         return rc;
2237 out_kfree:
2238         kfree(s);
2239         goto out;
2240 }
2241
2242 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2243 {
2244         int rc = 0;
2245         struct proc_dir_entry *p;
2246
2247         if (!afinfo)
2248                 return -EINVAL;
2249         afinfo->seq_fops->owner         = afinfo->owner;
2250         afinfo->seq_fops->open          = tcp_seq_open;
2251         afinfo->seq_fops->read          = seq_read;
2252         afinfo->seq_fops->llseek        = seq_lseek;
2253         afinfo->seq_fops->release       = seq_release_private;
2254
2255         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2256         if (p)
2257                 p->data = afinfo;
2258         else
2259                 rc = -ENOMEM;
2260         return rc;
2261 }
2262
2263 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2264 {
2265         if (!afinfo)
2266                 return;
2267         proc_net_remove(afinfo->name);
2268         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2269 }
2270
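/* The three helpers below format one /proc/net/tcp row each, for an open
 * request, an established (or listening) socket and a TIME_WAIT socket
 * respectively.  They share the fixed-width layout announced in
 * tcp4_seq_show() so that userland tools such as netstat can parse it.
 */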
2271 static void get_openreq4(struct sock *sk, struct request_sock *req,
2272                          char *tmpbuf, int i, int uid)
2273 {
2274         const struct inet_request_sock *ireq = inet_rsk(req);
2275         int ttd = req->expires - jiffies;
2276
2277         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2278                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2279                 i,
2280                 ireq->loc_addr,
2281                 ntohs(inet_sk(sk)->sport),
2282                 ireq->rmt_addr,
2283                 ntohs(ireq->rmt_port),
2284                 TCP_SYN_RECV,
2285                 0, 0, /* could print option size, but that is af dependent. */
2286                 1,    /* timers active (only the expire timer) */
2287                 jiffies_to_clock_t(ttd),
2288                 req->retrans,
2289                 uid,
2290                 0,  /* non standard timer */
2291                 0, /* open_requests have no inode */
2292                 atomic_read(&sk->sk_refcnt),
2293                 req);
2294 }
2295
2296 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2297 {
2298         int timer_active;
2299         unsigned long timer_expires;
2300         struct tcp_sock *tp = tcp_sk(sp);
2301         const struct inet_connection_sock *icsk = inet_csk(sp);
2302         struct inet_sock *inet = inet_sk(sp);
2303         __be32 dest = inet->daddr;
2304         __be32 src = inet->rcv_saddr;
2305         __u16 destp = ntohs(inet->dport);
2306         __u16 srcp = ntohs(inet->sport);
2307
2308         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2309                 timer_active    = 1;
2310                 timer_expires   = icsk->icsk_timeout;
2311         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2312                 timer_active    = 4;
2313                 timer_expires   = icsk->icsk_timeout;
2314         } else if (timer_pending(&sp->sk_timer)) {
2315                 timer_active    = 2;
2316                 timer_expires   = sp->sk_timer.expires;
2317         } else {
2318                 timer_active    = 0;
2319                 timer_expires = jiffies;
2320         }
2321
2322         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2323                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2324                 i, src, srcp, dest, destp, sp->sk_state,
2325                 tp->write_seq - tp->snd_una,
2326                 sp->sk_state == TCP_LISTEN ? sp->sk_ack_backlog :
2327                                              (tp->rcv_nxt - tp->copied_seq),
2328                 timer_active,
2329                 jiffies_to_clock_t(timer_expires - jiffies),
2330                 icsk->icsk_retransmits,
2331                 sock_i_uid(sp),
2332                 icsk->icsk_probes_out,
2333                 sock_i_ino(sp),
2334                 atomic_read(&sp->sk_refcnt), sp,
2335                 icsk->icsk_rto,
2336                 icsk->icsk_ack.ato,
2337                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2338                 tp->snd_cwnd,
2339                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2340 }
2341
2342 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2343                                char *tmpbuf, int i)
2344 {
2345         __be32 dest, src;
2346         __u16 destp, srcp;
2347         int ttd = tw->tw_ttd - jiffies;
2348
2349         if (ttd < 0)
2350                 ttd = 0;
2351
2352         dest  = tw->tw_daddr;
2353         src   = tw->tw_rcv_saddr;
2354         destp = ntohs(tw->tw_dport);
2355         srcp  = ntohs(tw->tw_sport);
2356
2357         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2358                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2359                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2360                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2361                 atomic_read(&tw->tw_refcnt), tw);
2362 }
2363
2364 #define TMPSZ 150
2365
2366 static int tcp4_seq_show(struct seq_file *seq, void *v)
2367 {
2368         struct tcp_iter_state* st;
2369         char tmpbuf[TMPSZ + 1];
2370
2371         if (v == SEQ_START_TOKEN) {
2372                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2373                            "  sl  local_address rem_address   st tx_queue "
2374                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2375                            "inode");
2376                 goto out;
2377         }
2378         st = seq->private;
2379
2380         switch (st->state) {
2381         case TCP_SEQ_STATE_LISTENING:
2382         case TCP_SEQ_STATE_ESTABLISHED:
2383                 get_tcp4_sock(v, tmpbuf, st->num);
2384                 break;
2385         case TCP_SEQ_STATE_OPENREQ:
2386                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2387                 break;
2388         case TCP_SEQ_STATE_TIME_WAIT:
2389                 get_timewait4_sock(v, tmpbuf, st->num);
2390                 break;
2391         }
2392         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2393 out:
2394         return 0;
2395 }
2396
2397 static struct file_operations tcp4_seq_fops;
2398 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2399         .owner          = THIS_MODULE,
2400         .name           = "tcp",
2401         .family         = AF_INET,
2402         .seq_show       = tcp4_seq_show,
2403         .seq_fops       = &tcp4_seq_fops,
2404 };
2405
2406 int __init tcp4_proc_init(void)
2407 {
2408         return tcp_proc_register(&tcp4_seq_afinfo);
2409 }
2410
2411 void tcp4_proc_exit(void)
2412 {
2413         tcp_proc_unregister(&tcp4_seq_afinfo);
2414 }
2415 #endif /* CONFIG_PROC_FS */
2416
2417 struct proto tcp_prot = {
2418         .name                   = "TCP",
2419         .owner                  = THIS_MODULE,
2420         .close                  = tcp_close,
2421         .connect                = tcp_v4_connect,
2422         .disconnect             = tcp_disconnect,
2423         .accept                 = inet_csk_accept,
2424         .ioctl                  = tcp_ioctl,
2425         .init                   = tcp_v4_init_sock,
2426         .destroy                = tcp_v4_destroy_sock,
2427         .shutdown               = tcp_shutdown,
2428         .setsockopt             = tcp_setsockopt,
2429         .getsockopt             = tcp_getsockopt,
2430         .sendmsg                = tcp_sendmsg,
2431         .recvmsg                = tcp_recvmsg,
2432         .backlog_rcv            = tcp_v4_do_rcv,
2433         .hash                   = tcp_v4_hash,
2434         .unhash                 = tcp_unhash,
2435         .get_port               = tcp_v4_get_port,
2436         .enter_memory_pressure  = tcp_enter_memory_pressure,
2437         .sockets_allocated      = &tcp_sockets_allocated,
2438         .orphan_count           = &tcp_orphan_count,
2439         .memory_allocated       = &tcp_memory_allocated,
2440         .memory_pressure        = &tcp_memory_pressure,
2441         .sysctl_mem             = sysctl_tcp_mem,
2442         .sysctl_wmem            = sysctl_tcp_wmem,
2443         .sysctl_rmem            = sysctl_tcp_rmem,
2444         .max_header             = MAX_TCP_HEADER,
2445         .obj_size               = sizeof(struct tcp_sock),
2446         .twsk_prot              = &tcp_timewait_sock_ops,
2447         .rsk_prot               = &tcp_request_sock_ops,
2448 #ifdef CONFIG_COMPAT
2449         .compat_setsockopt      = compat_tcp_setsockopt,
2450         .compat_getsockopt      = compat_tcp_getsockopt,
2451 #endif
2452 };
2453
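/* Create the kernel-internal control socket used when TCP has to
 * transmit resets and ACKs on behalf of no particular local socket
 * (e.g. RSTs for segments that reach no listener).
 */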
2454 void __init tcp_v4_init(struct net_proto_family *ops)
2455 {
2456         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2457                                      IPPROTO_TCP) < 0)
2458                 panic("Failed to create the TCP control socket.\n");
2459 }
2460
2461 EXPORT_SYMBOL(ipv4_specific);
2462 EXPORT_SYMBOL(tcp_hashinfo);
2463 EXPORT_SYMBOL(tcp_prot);
2464 EXPORT_SYMBOL(tcp_unhash);
2465 EXPORT_SYMBOL(tcp_v4_conn_request);
2466 EXPORT_SYMBOL(tcp_v4_connect);
2467 EXPORT_SYMBOL(tcp_v4_do_rcv);
2468 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2469 EXPORT_SYMBOL(tcp_v4_send_check);
2470 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2471
2472 #ifdef CONFIG_PROC_FS
2473 EXPORT_SYMBOL(tcp_proc_register);
2474 EXPORT_SYMBOL(tcp_proc_unregister);
2475 #endif
2476 EXPORT_SYMBOL(sysctl_local_port_range);
2477 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2478