1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
83 int sysctl_tcp_tw_reuse __read_mostly;
84 int sysctl_tcp_low_latency __read_mostly;
85
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
91                                    __be32 saddr, __be32 daddr,
92                                    struct tcphdr *th, unsigned int tcplen);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
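/* Global hash table state for TCP sockets; only the listening-hash lock,
 * user count and wait queue need static initialization here.
 */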
101 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
102         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
103         .lhash_users = ATOMIC_INIT(0),
104         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
105 };
106
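/* Pick the initial sequence number for the connection answering this SYN,
 * derived from the 4-tuple of the incoming segment (addresses and ports
 * swapped, since the reply travels in the opposite direction).
 */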
107 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
108 {
109         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
110                                           ip_hdr(skb)->saddr,
111                                           tcp_hdr(skb)->dest,
112                                           tcp_hdr(skb)->source);
113 }
114
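/* Decide whether a TIME-WAIT socket occupying the wanted 4-tuple may be
 * reused for a new connection; if so, seed the new socket's sequence
 * number and timestamp state from the timewait bucket.
 */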
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118         struct tcp_sock *tp = tcp_sk(sk);
119
120         /* With PAWS, it is safe from the viewpoint
121            of data integrity. Even without PAWS it is safe provided sequence
 122            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
 123
 124            Actually, the idea is close to VJ's: only the timestamp cache is
 125            held not per host but per port pair, and the TW bucket is used as
 126            the state holder.
 127
 128            If the TW bucket has already been destroyed, we fall back to VJ's
 129            scheme and use the initial timestamp retrieved from the peer table.
130          */
131         if (tcptw->tw_ts_recent_stamp &&
132             (twp == NULL || (sysctl_tcp_tw_reuse &&
133                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
134                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
135                 if (tp->write_seq == 0)
136                         tp->write_seq = 1;
137                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
138                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
139                 sock_hold(sktw);
140                 return 1;
141         }
142
143         return 0;
144 }
145
146 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
147
148 /* This will initiate an outgoing connection. */
149 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
150 {
151         struct inet_sock *inet = inet_sk(sk);
152         struct tcp_sock *tp = tcp_sk(sk);
153         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
154         struct rtable *rt;
155         __be32 daddr, nexthop;
156         int tmp;
157         int err;
158
159         if (addr_len < sizeof(struct sockaddr_in))
160                 return -EINVAL;
161
162         if (usin->sin_family != AF_INET)
163                 return -EAFNOSUPPORT;
164
165         nexthop = daddr = usin->sin_addr.s_addr;
166         if (inet->opt && inet->opt->srr) {
167                 if (!daddr)
168                         return -EINVAL;
169                 nexthop = inet->opt->faddr;
170         }
171
172         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
173                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174                                IPPROTO_TCP,
175                                inet->sport, usin->sin_port, sk, 1);
176         if (tmp < 0) {
177                 if (tmp == -ENETUNREACH)
178                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
179                 return tmp;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet->opt || !inet->opt->srr)
188                 daddr = rt->rt_dst;
189
190         if (!inet->saddr)
191                 inet->saddr = rt->rt_src;
192         inet->rcv_saddr = inet->saddr;
193
194         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 tp->write_seq              = 0;
199         }
200
201         if (tcp_death_row.sysctl_tw_recycle &&
202             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
203                 struct inet_peer *peer = rt_get_peer(rt);
204                 /*
 205                  * VJ's idea. We save the last timestamp seen from
 206                  * the destination in the peer table when entering
 207                  * TIME-WAIT state, and initialize rx_opt.ts_recent
 208                  * from it when trying a new connection.
209                  */
210                 if (peer != NULL &&
211                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
212                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
213                         tp->rx_opt.ts_recent = peer->tcp_ts;
214                 }
215         }
216
217         inet->dport = usin->sin_port;
218         inet->daddr = daddr;
219
220         inet_csk(sk)->icsk_ext_hdr_len = 0;
221         if (inet->opt)
222                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
223
224         tp->rx_opt.mss_clamp = 536;
225
226         /* Socket identity is still unknown (sport may be zero).
 227          * However, we set the state to SYN-SENT and, without releasing the
 228          * socket lock, select a source port, enter ourselves into the hash
 229          * tables and complete initialization after that.
230          */
231         tcp_set_state(sk, TCP_SYN_SENT);
232         err = inet_hash_connect(&tcp_death_row, sk);
233         if (err)
234                 goto failure;
235
236         err = ip_route_newports(&rt, IPPROTO_TCP,
237                                 inet->sport, inet->dport, sk);
238         if (err)
239                 goto failure;
240
241         /* OK, now commit destination to socket.  */
242         sk->sk_gso_type = SKB_GSO_TCPV4;
243         sk_setup_caps(sk, &rt->u.dst);
244
245         if (!tp->write_seq)
246                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
247                                                            inet->daddr,
248                                                            inet->sport,
249                                                            usin->sin_port);
250
251         inet->id = tp->write_seq ^ jiffies;
252
253         err = tcp_connect(sk);
254         rt = NULL;
255         if (err)
256                 goto failure;
257
258         return 0;
259
260 failure:
261         /*
262          * This unhashes the socket and releases the local port,
263          * if necessary.
264          */
265         tcp_set_state(sk, TCP_CLOSE);
266         ip_rt_put(rt);
267         sk->sk_route_caps = 0;
268         inet->dport = 0;
269         return err;
270 }
271
272 /*
273  * This routine does path mtu discovery as defined in RFC1191.
274  */
275 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
276 {
277         struct dst_entry *dst;
278         struct inet_sock *inet = inet_sk(sk);
279
280         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 281          * sent out by Linux are always < 576 bytes, so they should go through
282          * unfragmented).
283          */
284         if (sk->sk_state == TCP_LISTEN)
285                 return;
286
 287         /* We don't check in the dst entry whether PMTU discovery is forbidden
 288          * on this route. We just assume that no packet-too-big packets
 289          * are sent back when PMTU discovery is not active.
290          * There is a small race when the user changes this flag in the
291          * route, but I think that's acceptable.
292          */
293         if ((dst = __sk_dst_check(sk, 0)) == NULL)
294                 return;
295
296         dst->ops->update_pmtu(dst, mtu);
297
 298         /* Something is about to go wrong... Remember the soft error
 299          * in case this connection is not able to recover.
300          */
301         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
302                 sk->sk_err_soft = EMSGSIZE;
303
304         mtu = dst_mtu(dst);
305
306         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
307             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
308                 tcp_sync_mss(sk, mtu);
309
310                 /* Resend the TCP packet because it's
311                  * clear that the old packet has been
312                  * dropped. This is the new "fast" path mtu
313                  * discovery.
314                  */
315                 tcp_simple_retransmit(sk);
316         } /* else let the usual retransmit timer handle it */
317 }
318
319 /*
320  * This routine is called by the ICMP module when it gets some
321  * sort of error condition.  If err < 0 then the socket should
322  * be closed and the error returned to the user.  If err > 0
 323  * it's just the ICMP type << 8 | ICMP code.  After adjustment,
 324  * the header points to the first 8 bytes of the TCP header.  We need
325  * to find the appropriate port.
326  *
327  * The locking strategy used here is very "optimistic". When
328  * someone else accesses the socket the ICMP is just dropped
329  * and for some paths there is no check at all.
330  * A more general error queue to queue errors for later handling
331  * is probably better.
332  *
333  */
334
335 void tcp_v4_err(struct sk_buff *skb, u32 info)
336 {
337         struct iphdr *iph = (struct iphdr *)skb->data;
338         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
339         struct tcp_sock *tp;
340         struct inet_sock *inet;
341         const int type = icmp_hdr(skb)->type;
342         const int code = icmp_hdr(skb)->code;
343         struct sock *sk;
344         __u32 seq;
345         int err;
346
347         if (skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         tp = tcp_sk(sk);
374         seq = ntohl(th->seq);
375         if (sk->sk_state != TCP_LISTEN &&
376             !between(seq, tp->snd_una, tp->snd_nxt)) {
377                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
378                 goto out;
379         }
380
381         switch (type) {
382         case ICMP_SOURCE_QUENCH:
383                 /* Just silently ignore these. */
384                 goto out;
385         case ICMP_PARAMETERPROB:
386                 err = EPROTO;
387                 break;
388         case ICMP_DEST_UNREACH:
389                 if (code > NR_ICMP_UNREACH)
390                         goto out;
391
392                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393                         if (!sock_owned_by_user(sk))
394                                 do_pmtu_discovery(sk, iph, info);
395                         goto out;
396                 }
397
398                 err = icmp_err_convert[code].errno;
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 err = EHOSTUNREACH;
402                 break;
403         default:
404                 goto out;
405         }
406
407         switch (sk->sk_state) {
408                 struct request_sock *req, **prev;
409         case TCP_LISTEN:
410                 if (sock_owned_by_user(sk))
411                         goto out;
412
413                 req = inet_csk_search_req(sk, &prev, th->dest,
414                                           iph->daddr, iph->saddr);
415                 if (!req)
416                         goto out;
417
418                 /* ICMPs are not backlogged, hence we cannot get
419                    an established socket here.
420                  */
421                 BUG_TRAP(!req->sk);
422
423                 if (seq != tcp_rsk(req)->snt_isn) {
424                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
425                         goto out;
426                 }
427
428                 /*
429                  * Still in SYN_RECV, just remove it silently.
430                  * There is no good way to pass the error to the newly
431                  * created socket, and POSIX does not want network
432                  * errors returned from accept().
433                  */
434                 inet_csk_reqsk_queue_drop(sk, req, prev);
435                 goto out;
436
437         case TCP_SYN_SENT:
 438         case TCP_SYN_RECV:  /* Cannot happen normally.
 439                                It can, e.g., if SYNs crossed.
440                              */
441                 if (!sock_owned_by_user(sk)) {
442                         sk->sk_err = err;
443
444                         sk->sk_error_report(sk);
445
446                         tcp_done(sk);
447                 } else {
448                         sk->sk_err_soft = err;
449                 }
450                 goto out;
451         }
452
453         /* If we've already connected we will keep trying
454          * until we time out, or the user gives up.
455          *
 456          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to
 457          * be treated as hard errors (well, FRAG_FAILED too,
 458          * but it is obsoleted by PMTU discovery).
 459          *
 460          * Note that in the modern Internet, where routing is unreliable
 461          * and broken firewalls sit in every dark corner, sending random
 462          * errors ordered by their masters, even these two messages finally
 463          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
464          *
465          * Now we are in compliance with RFCs.
466          *                                                      --ANK (980905)
467          */
468
469         inet = inet_sk(sk);
470         if (!sock_owned_by_user(sk) && inet->recverr) {
471                 sk->sk_err = err;
472                 sk->sk_error_report(sk);
473         } else  { /* Only an error on timeout */
474                 sk->sk_err_soft = err;
475         }
476
477 out:
478         bh_unlock_sock(sk);
479         sock_put(sk);
480 }
481
482 /* This routine computes an IPv4 TCP checksum. */
483 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484 {
485         struct inet_sock *inet = inet_sk(sk);
486         struct tcphdr *th = tcp_hdr(skb);
487
488         if (skb->ip_summed == CHECKSUM_PARTIAL) {
489                 th->check = ~tcp_v4_check(len, inet->saddr,
490                                           inet->daddr, 0);
491                 skb->csum_start = skb_transport_header(skb) - skb->head;
492                 skb->csum_offset = offsetof(struct tcphdr, check);
493         } else {
494                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495                                          csum_partial((char *)th,
496                                                       th->doff << 2,
497                                                       skb->csum));
498         }
499 }
500
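/* Prepare a GSO skb for checksum offload: seed th->check from the
 * pseudo-header and record where the final checksum must be written.
 */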
501 int tcp_v4_gso_send_check(struct sk_buff *skb)
502 {
503         const struct iphdr *iph;
504         struct tcphdr *th;
505
506         if (!pskb_may_pull(skb, sizeof(*th)))
507                 return -EINVAL;
508
509         iph = ip_hdr(skb);
510         th = tcp_hdr(skb);
511
512         th->check = 0;
513         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514         skb->csum_start = skb_transport_header(skb) - skb->head;
515         skb->csum_offset = offsetof(struct tcphdr, check);
516         skb->ip_summed = CHECKSUM_PARTIAL;
517         return 0;
518 }
519
520 /*
521  *      This routine will send an RST to the other tcp.
522  *
 523  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
 524  *                    for the reset?
 525  *      Answer: if a packet caused the RST, it is not for a socket
 526  *              existing in our system; if it is matched to a socket,
 527  *              it is just a duplicate segment or a bug in the other side's TCP.
 528  *              So we build the reply based only on the parameters
 529  *              that arrived with the segment.
530  *      Exception: precedence violation. We do not implement it in any case.
531  */
532
533 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534 {
535         struct tcphdr *th = tcp_hdr(skb);
536         struct {
537                 struct tcphdr th;
538 #ifdef CONFIG_TCP_MD5SIG
539                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540 #endif
541         } rep;
542         struct ip_reply_arg arg;
543 #ifdef CONFIG_TCP_MD5SIG
544         struct tcp_md5sig_key *key;
545 #endif
546
547         /* Never send a reset in response to a reset. */
548         if (th->rst)
549                 return;
550
551         if (skb->rtable->rt_type != RTN_LOCAL)
552                 return;
553
554         /* Swap the send and the receive. */
555         memset(&rep, 0, sizeof(rep));
556         rep.th.dest   = th->source;
557         rep.th.source = th->dest;
558         rep.th.doff   = sizeof(struct tcphdr) / 4;
559         rep.th.rst    = 1;
560
561         if (th->ack) {
562                 rep.th.seq = th->ack_seq;
563         } else {
564                 rep.th.ack = 1;
565                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
566                                        skb->len - (th->doff << 2));
567         }
568
569         memset(&arg, 0, sizeof(arg));
570         arg.iov[0].iov_base = (unsigned char *)&rep;
571         arg.iov[0].iov_len  = sizeof(rep.th);
572
573 #ifdef CONFIG_TCP_MD5SIG
574         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
575         if (key) {
576                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
577                                    (TCPOPT_NOP << 16) |
578                                    (TCPOPT_MD5SIG << 8) |
579                                    TCPOLEN_MD5SIG);
580                 /* Update length and the length the header thinks exists */
581                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
582                 rep.th.doff = arg.iov[0].iov_len / 4;
583
584                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
585                                         key,
586                                         ip_hdr(skb)->daddr,
587                                         ip_hdr(skb)->saddr,
588                                         &rep.th, arg.iov[0].iov_len);
589         }
590 #endif
591         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
592                                       ip_hdr(skb)->saddr, /* XXX */
593                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
594         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
595
596         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
597                       &arg, arg.iov[0].iov_len);
598
599         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
600         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
601 }
602
 603 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 604    outside socket context, is certainly ugly. What can I do?
605  */
606
607 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608                             u32 win, u32 ts, int oif,
609                             struct tcp_md5sig_key *key)
610 {
611         struct tcphdr *th = tcp_hdr(skb);
612         struct {
613                 struct tcphdr th;
614                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615 #ifdef CONFIG_TCP_MD5SIG
616                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617 #endif
618                         ];
619         } rep;
620         struct ip_reply_arg arg;
621
622         memset(&rep.th, 0, sizeof(struct tcphdr));
623         memset(&arg, 0, sizeof(arg));
624
625         arg.iov[0].iov_base = (unsigned char *)&rep;
626         arg.iov[0].iov_len  = sizeof(rep.th);
627         if (ts) {
628                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
629                                    (TCPOPT_TIMESTAMP << 8) |
630                                    TCPOLEN_TIMESTAMP);
631                 rep.opt[1] = htonl(tcp_time_stamp);
632                 rep.opt[2] = htonl(ts);
633                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
634         }
635
636         /* Swap the send and the receive. */
637         rep.th.dest    = th->source;
638         rep.th.source  = th->dest;
639         rep.th.doff    = arg.iov[0].iov_len / 4;
640         rep.th.seq     = htonl(seq);
641         rep.th.ack_seq = htonl(ack);
642         rep.th.ack     = 1;
643         rep.th.window  = htons(win);
644
645 #ifdef CONFIG_TCP_MD5SIG
646         if (key) {
647                 int offset = (ts) ? 3 : 0;
648
649                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
650                                           (TCPOPT_NOP << 16) |
651                                           (TCPOPT_MD5SIG << 8) |
652                                           TCPOLEN_MD5SIG);
653                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
654                 rep.th.doff = arg.iov[0].iov_len/4;
655
656                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
657                                         key,
658                                         ip_hdr(skb)->daddr,
659                                         ip_hdr(skb)->saddr,
660                                         &rep.th, arg.iov[0].iov_len);
661         }
662 #endif
663         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
664                                       ip_hdr(skb)->saddr, /* XXX */
665                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
666         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
667         if (oif)
668                 arg.bound_dev_if = oif;
669
670         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
671                       &arg, arg.iov[0].iov_len);
672
673         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
674 }
675
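/* Send an ACK on behalf of a TIME-WAIT socket, using the sequence,
 * window and timestamp state preserved in the timewait bucket.
 */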
676 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
677 {
678         struct inet_timewait_sock *tw = inet_twsk(sk);
679         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
680
681         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
682                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
683                         tcptw->tw_ts_recent,
684                         tw->tw_bound_dev_if,
685                         tcp_twsk_md5_key(tcptw)
686                         );
687
688         inet_twsk_put(tw);
689 }
690
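/* Acknowledge a segment that matched a pending connection request
 * (SYN-RECV), built purely from the request_sock state.
 */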
691 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
692                                   struct request_sock *req)
693 {
694         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
695                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
696                         req->ts_recent,
697                         0,
698                         tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
699 }
700
701 /*
702  *      Send a SYN-ACK after having received a SYN.
703  *      This still operates on a request_sock only, not on a big
704  *      socket.
705  */
706 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
707                                 struct dst_entry *dst)
708 {
709         const struct inet_request_sock *ireq = inet_rsk(req);
710         int err = -1;
711         struct sk_buff * skb;
712
713         /* First, grab a route. */
714         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
715                 return -1;
716
717         skb = tcp_make_synack(sk, dst, req);
718
719         if (skb) {
720                 struct tcphdr *th = tcp_hdr(skb);
721
722                 th->check = tcp_v4_check(skb->len,
723                                          ireq->loc_addr,
724                                          ireq->rmt_addr,
725                                          csum_partial((char *)th, skb->len,
726                                                       skb->csum));
727
728                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
729                                             ireq->rmt_addr,
730                                             ireq->opt);
731                 err = net_xmit_eval(err);
732         }
733
734         dst_release(dst);
735         return err;
736 }
737
738 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
739 {
740         return __tcp_v4_send_synack(sk, req, NULL);
741 }
742
743 /*
744  *      IPv4 request_sock destructor.
745  */
746 static void tcp_v4_reqsk_destructor(struct request_sock *req)
747 {
748         kfree(inet_rsk(req)->opt);
749 }
750
751 #ifdef CONFIG_SYN_COOKIES
752 static void syn_flood_warning(struct sk_buff *skb)
753 {
754         static unsigned long warntime;
755
756         if (time_after(jiffies, (warntime + HZ * 60))) {
757                 warntime = jiffies;
758                 printk(KERN_INFO
759                        "possible SYN flooding on port %d. Sending cookies.\n",
760                        ntohs(tcp_hdr(skb)->dest));
761         }
762 }
763 #endif
764
765 /*
766  * Save and compile IPv4 options into the request_sock if needed.
767  */
768 static struct ip_options *tcp_v4_save_options(struct sock *sk,
769                                               struct sk_buff *skb)
770 {
771         struct ip_options *opt = &(IPCB(skb)->opt);
772         struct ip_options *dopt = NULL;
773
774         if (opt && opt->optlen) {
775                 int opt_size = optlength(opt);
776                 dopt = kmalloc(opt_size, GFP_ATOMIC);
777                 if (dopt) {
778                         if (ip_options_echo(dopt, skb)) {
779                                 kfree(dopt);
780                                 dopt = NULL;
781                         }
782                 }
783         }
784         return dopt;
785 }
786
787 #ifdef CONFIG_TCP_MD5SIG
788 /*
789  * RFC2385 MD5 checksumming requires a mapping of
790  * IP address->MD5 Key.
791  * We need to maintain these in the sk structure.
792  */
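/* For illustration only (not part of this file): a minimal userspace
 * sketch of installing such a key with the TCP_MD5SIG socket option,
 * which is parsed by tcp_v4_parse_md5_keys() below; error handling and
 * a real key are omitted:
 *
 *      struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *      struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *      peer->sin_family = AF_INET;
 *      peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *      memcpy(md5.tcpm_key, "secret", 6);
 *      setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */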
793
794 /* Find the Key structure for an address.  */
795 static struct tcp_md5sig_key *
796                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
797 {
798         struct tcp_sock *tp = tcp_sk(sk);
799         int i;
800
801         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
802                 return NULL;
803         for (i = 0; i < tp->md5sig_info->entries4; i++) {
804                 if (tp->md5sig_info->keys4[i].addr == addr)
805                         return &tp->md5sig_info->keys4[i].base;
806         }
807         return NULL;
808 }
809
810 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
811                                          struct sock *addr_sk)
812 {
813         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
814 }
815
816 EXPORT_SYMBOL(tcp_v4_md5_lookup);
817
818 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
819                                                       struct request_sock *req)
820 {
821         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
822 }
823
824 /* This can be called on a newly created socket, from other files */
825 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
826                       u8 *newkey, u8 newkeylen)
827 {
828         /* Add Key to the list */
829         struct tcp_md5sig_key *key;
830         struct tcp_sock *tp = tcp_sk(sk);
831         struct tcp4_md5sig_key *keys;
832
833         key = tcp_v4_md5_do_lookup(sk, addr);
834         if (key) {
835                 /* Pre-existing entry - just update that one. */
836                 kfree(key->key);
837                 key->key = newkey;
838                 key->keylen = newkeylen;
839         } else {
840                 struct tcp_md5sig_info *md5sig;
841
842                 if (!tp->md5sig_info) {
843                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
844                                                   GFP_ATOMIC);
845                         if (!tp->md5sig_info) {
846                                 kfree(newkey);
847                                 return -ENOMEM;
848                         }
849                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
850                 }
851                 if (tcp_alloc_md5sig_pool() == NULL) {
852                         kfree(newkey);
853                         return -ENOMEM;
854                 }
855                 md5sig = tp->md5sig_info;
856
857                 if (md5sig->alloced4 == md5sig->entries4) {
858                         keys = kmalloc((sizeof(*keys) *
859                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
860                         if (!keys) {
861                                 kfree(newkey);
862                                 tcp_free_md5sig_pool();
863                                 return -ENOMEM;
864                         }
865
866                         if (md5sig->entries4)
867                                 memcpy(keys, md5sig->keys4,
868                                        sizeof(*keys) * md5sig->entries4);
869
870                         /* Free old key list, and reference new one */
871                         kfree(md5sig->keys4);
872                         md5sig->keys4 = keys;
873                         md5sig->alloced4++;
874                 }
875                 md5sig->entries4++;
876                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
877                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
878                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
879         }
880         return 0;
881 }
882
883 EXPORT_SYMBOL(tcp_v4_md5_do_add);
884
885 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
886                                u8 *newkey, u8 newkeylen)
887 {
888         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
889                                  newkey, newkeylen);
890 }
891
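/* Delete the MD5 key configured for the given peer address, if any,
 * compacting the key array and dropping our reference on the MD5 pool.
 */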
892 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
893 {
894         struct tcp_sock *tp = tcp_sk(sk);
895         int i;
896
897         for (i = 0; i < tp->md5sig_info->entries4; i++) {
898                 if (tp->md5sig_info->keys4[i].addr == addr) {
899                         /* Free the key */
900                         kfree(tp->md5sig_info->keys4[i].base.key);
901                         tp->md5sig_info->entries4--;
902
903                         if (tp->md5sig_info->entries4 == 0) {
904                                 kfree(tp->md5sig_info->keys4);
905                                 tp->md5sig_info->keys4 = NULL;
906                                 tp->md5sig_info->alloced4 = 0;
907                         } else if (tp->md5sig_info->entries4 != i) {
908                                 /* Need to do some manipulation */
909                                 memmove(&tp->md5sig_info->keys4[i],
910                                         &tp->md5sig_info->keys4[i+1],
911                                         (tp->md5sig_info->entries4 - i) *
912                                          sizeof(struct tcp4_md5sig_key));
913                         }
914                         tcp_free_md5sig_pool();
915                         return 0;
916                 }
917         }
918         return -ENOENT;
919 }
920
921 EXPORT_SYMBOL(tcp_v4_md5_do_del);
922
923 static void tcp_v4_clear_md5_list(struct sock *sk)
924 {
925         struct tcp_sock *tp = tcp_sk(sk);
926
 927         /* Free each key, then drop our hold on the
 928          * shared MD5 crypto pool, and finally free
 929          * the key array itself.
930          */
931         if (tp->md5sig_info->entries4) {
932                 int i;
933                 for (i = 0; i < tp->md5sig_info->entries4; i++)
934                         kfree(tp->md5sig_info->keys4[i].base.key);
935                 tp->md5sig_info->entries4 = 0;
936                 tcp_free_md5sig_pool();
937         }
938         if (tp->md5sig_info->keys4) {
939                 kfree(tp->md5sig_info->keys4);
940                 tp->md5sig_info->keys4 = NULL;
941                 tp->md5sig_info->alloced4  = 0;
942         }
943 }
944
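/* Handle the TCP_MD5SIG socket option: copy the request from userspace
 * and add, replace or delete the per-peer key accordingly.
 */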
945 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
946                                  int optlen)
947 {
948         struct tcp_md5sig cmd;
949         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
950         u8 *newkey;
951
952         if (optlen < sizeof(cmd))
953                 return -EINVAL;
954
955         if (copy_from_user(&cmd, optval, sizeof(cmd)))
956                 return -EFAULT;
957
958         if (sin->sin_family != AF_INET)
959                 return -EINVAL;
960
961         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
962                 if (!tcp_sk(sk)->md5sig_info)
963                         return -ENOENT;
964                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
965         }
966
967         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
968                 return -EINVAL;
969
970         if (!tcp_sk(sk)->md5sig_info) {
971                 struct tcp_sock *tp = tcp_sk(sk);
972                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
973
974                 if (!p)
975                         return -EINVAL;
976
977                 tp->md5sig_info = p;
978                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
979         }
980
981         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
982         if (!newkey)
983                 return -ENOMEM;
984         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
985                                  newkey, cmd.tcpm_keylen);
986 }
987
988 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
989                                    __be32 saddr, __be32 daddr,
990                                    struct tcphdr *th,
991                                    unsigned int tcplen)
992 {
993         struct tcp_md5sig_pool *hp;
994         struct tcp4_pseudohdr *bp;
995         int err;
996
997         /*
998          * Okay, so RFC2385 is turned on for this connection,
999          * so we need to generate the MD5 hash for the packet now.
1000          */
1001
1002         hp = tcp_get_md5sig_pool();
1003         if (!hp)
1004                 goto clear_hash_noput;
1005
1006         bp = &hp->md5_blk.ip4;
1007
1008         /*
1009          * The TCP pseudo-header (in the order: source IP address,
1010          * destination IP address, zero-padded protocol number, and
1011          * segment length)
1012          */
1013         bp->saddr = saddr;
1014         bp->daddr = daddr;
1015         bp->pad = 0;
1016         bp->protocol = IPPROTO_TCP;
1017         bp->len = htons(tcplen);
1018
1019         err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1020                                 th, tcplen, hp);
1021         if (err)
1022                 goto clear_hash;
1023
1024         /* Free up the crypto pool */
1025         tcp_put_md5sig_pool();
1026 out:
1027         return 0;
1028 clear_hash:
1029         tcp_put_md5sig_pool();
1030 clear_hash_noput:
1031         memset(md5_hash, 0, 16);
1032         goto out;
1033 }
1034
1035 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1036                          struct sock *sk,
1037                          struct dst_entry *dst,
1038                          struct request_sock *req,
1039                          struct tcphdr *th,
1040                          unsigned int tcplen)
1041 {
1042         __be32 saddr, daddr;
1043
1044         if (sk) {
1045                 saddr = inet_sk(sk)->saddr;
1046                 daddr = inet_sk(sk)->daddr;
1047         } else {
1048                 struct rtable *rt = (struct rtable *)dst;
1049                 BUG_ON(!rt);
1050                 saddr = rt->rt_src;
1051                 daddr = rt->rt_dst;
1052         }
1053         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1054                                        saddr, daddr,
1055                                        th, tcplen);
1056 }
1057
1058 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1059
1060 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1061 {
1062         /*
 1063          * This gets called for each TCP segment that arrives,
 1064          * so we want to be efficient.
 1065          * We have 3 drop cases:
 1066          * o No MD5 hash when one is expected.
 1067          * o MD5 hash when we're not expecting one.
 1068          * o MD5 hash present but wrong.
1069          */
1070         __u8 *hash_location = NULL;
1071         struct tcp_md5sig_key *hash_expected;
1072         const struct iphdr *iph = ip_hdr(skb);
1073         struct tcphdr *th = tcp_hdr(skb);
1074         int genhash;
1075         unsigned char newhash[16];
1076
1077         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1078         hash_location = tcp_parse_md5sig_option(th);
1079
1080         /* We've parsed the options - do we have a hash? */
1081         if (!hash_expected && !hash_location)
1082                 return 0;
1083
1084         if (hash_expected && !hash_location) {
1085                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1086                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1087                                NIPQUAD(iph->saddr), ntohs(th->source),
1088                                NIPQUAD(iph->daddr), ntohs(th->dest));
1089                 return 1;
1090         }
1091
1092         if (!hash_expected && hash_location) {
1093                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1094                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1095                                NIPQUAD(iph->saddr), ntohs(th->source),
1096                                NIPQUAD(iph->daddr), ntohs(th->dest));
1097                 return 1;
1098         }
1099
1100         /* Okay, so this is hash_expected and hash_location -
1101          * so we need to calculate the checksum.
1102          */
1103         genhash = tcp_v4_do_calc_md5_hash(newhash,
1104                                           hash_expected,
1105                                           iph->saddr, iph->daddr,
1106                                           th, skb->len);
1107
1108         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1109                 if (net_ratelimit()) {
1110                         printk(KERN_INFO "MD5 Hash failed for "
1111                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1112                                NIPQUAD(iph->saddr), ntohs(th->source),
1113                                NIPQUAD(iph->daddr), ntohs(th->dest),
1114                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1115                 }
1116                 return 1;
1117         }
1118         return 0;
1119 }
1120
1121 #endif
1122
1123 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1124         .family         =       PF_INET,
1125         .obj_size       =       sizeof(struct tcp_request_sock),
1126         .rtx_syn_ack    =       tcp_v4_send_synack,
1127         .send_ack       =       tcp_v4_reqsk_send_ack,
1128         .destructor     =       tcp_v4_reqsk_destructor,
1129         .send_reset     =       tcp_v4_send_reset,
1130 };
1131
1132 #ifdef CONFIG_TCP_MD5SIG
1133 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1134         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1135 };
1136 #endif
1137
1138 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1139         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1140         .twsk_unique    = tcp_twsk_unique,
1141         .twsk_destructor= tcp_twsk_destructor,
1142 };
1143
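/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * apply the SYN-cookie and tw_recycle/PAWS protections, and answer with
 * a SYN-ACK.
 */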
1144 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1145 {
1146         struct inet_request_sock *ireq;
1147         struct tcp_options_received tmp_opt;
1148         struct request_sock *req;
1149         __be32 saddr = ip_hdr(skb)->saddr;
1150         __be32 daddr = ip_hdr(skb)->daddr;
1151         __u32 isn = TCP_SKB_CB(skb)->when;
1152         struct dst_entry *dst = NULL;
1153 #ifdef CONFIG_SYN_COOKIES
1154         int want_cookie = 0;
1155 #else
1156 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1157 #endif
1158
 1159         /* Never answer SYNs sent to broadcast or multicast addresses */
1160         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1161                 goto drop;
1162
 1163         /* TW buckets are converted to open requests without
 1164          * limitation; they conserve resources and the peer is
 1165          * evidently a real one.
1166          */
1167         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1168 #ifdef CONFIG_SYN_COOKIES
1169                 if (sysctl_tcp_syncookies) {
1170                         want_cookie = 1;
1171                 } else
1172 #endif
1173                 goto drop;
1174         }
1175
 1176         /* The accept backlog is full. If we have already queued enough
 1177          * warm entries in the SYN queue, drop the request. That is better
 1178          * than clogging the SYN queue with openreqs with exponentially
 1179          * increasing timeouts.
1180          */
1181         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1182                 goto drop;
1183
1184         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1185         if (!req)
1186                 goto drop;
1187
1188 #ifdef CONFIG_TCP_MD5SIG
1189         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1190 #endif
1191
1192         tcp_clear_options(&tmp_opt);
1193         tmp_opt.mss_clamp = 536;
1194         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1195
1196         tcp_parse_options(skb, &tmp_opt, 0);
1197
1198         if (want_cookie && !tmp_opt.saw_tstamp)
1199                 tcp_clear_options(&tmp_opt);
1200
1201         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
 1202                 /* Some OSes (unknown ones, but I see them on a web server, which
 1203                  * contains information interesting only for Windows
 1204                  * users) do not send their timestamp in the SYN. It is an easy
 1205                  * case: we simply do not advertise TS support.
1206                  */
1207                 tmp_opt.saw_tstamp = 0;
1208                 tmp_opt.tstamp_ok  = 0;
1209         }
1210         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1211
1212         tcp_openreq_init(req, &tmp_opt, skb);
1213
1214         if (security_inet_conn_request(sk, skb, req))
1215                 goto drop_and_free;
1216
1217         ireq = inet_rsk(req);
1218         ireq->loc_addr = daddr;
1219         ireq->rmt_addr = saddr;
1220         ireq->opt = tcp_v4_save_options(sk, skb);
1221         if (!want_cookie)
1222                 TCP_ECN_create_request(req, tcp_hdr(skb));
1223
1224         if (want_cookie) {
1225 #ifdef CONFIG_SYN_COOKIES
1226                 syn_flood_warning(skb);
1227                 req->cookie_ts = tmp_opt.tstamp_ok;
1228 #endif
1229                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1230         } else if (!isn) {
1231                 struct inet_peer *peer = NULL;
1232
 1233                 /* VJ's idea. We save the last timestamp seen
 1234                  * from the destination in the peer table when entering
 1235                  * TIME-WAIT state, and check against it before
 1236                  * accepting a new connection request.
 1237                  *
 1238                  * If "isn" is not zero, this request hit a live
 1239                  * timewait bucket, so all the necessary checks
 1240                  * were made by the code processing the timewait state.
1241                  */
1242                 if (tmp_opt.saw_tstamp &&
1243                     tcp_death_row.sysctl_tw_recycle &&
1244                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1245                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1246                     peer->v4daddr == saddr) {
1247                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1248                             (s32)(peer->tcp_ts - req->ts_recent) >
1249                                                         TCP_PAWS_WINDOW) {
1250                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1251                                 goto drop_and_release;
1252                         }
1253                 }
1254                 /* Kill the following clause, if you dislike this way. */
1255                 else if (!sysctl_tcp_syncookies &&
1256                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1257                           (sysctl_max_syn_backlog >> 2)) &&
1258                          (!peer || !peer->tcp_ts_stamp) &&
1259                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 1260                         /* Without syncookies, the last quarter of the
 1261                          * backlog is filled only with destinations
 1262                          * proven to be alive.
 1263                          * It means that we continue to communicate
 1264                          * with destinations already remembered
 1265                          * at the moment of the SYN flood.
1266                          */
1267                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1268                                        "request from " NIPQUAD_FMT "/%u\n",
1269                                        NIPQUAD(saddr),
1270                                        ntohs(tcp_hdr(skb)->source));
1271                         goto drop_and_release;
1272                 }
1273
1274                 isn = tcp_v4_init_sequence(skb);
1275         }
1276         tcp_rsk(req)->snt_isn = isn;
1277
1278         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1279                 goto drop_and_free;
1280
1281         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1282         return 0;
1283
1284 drop_and_release:
1285         dst_release(dst);
1286 drop_and_free:
1287         reqsk_free(req);
1288 drop:
1289         return 0;
1290 }
1291
1292
1293 /*
1294  * The three way handshake has completed - we got a valid synack -
1295  * now create the new socket.
1296  */
1297 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1298                                   struct request_sock *req,
1299                                   struct dst_entry *dst)
1300 {
1301         struct inet_request_sock *ireq;
1302         struct inet_sock *newinet;
1303         struct tcp_sock *newtp;
1304         struct sock *newsk;
1305 #ifdef CONFIG_TCP_MD5SIG
1306         struct tcp_md5sig_key *key;
1307 #endif
1308
1309         if (sk_acceptq_is_full(sk))
1310                 goto exit_overflow;
1311
1312         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1313                 goto exit;
1314
1315         newsk = tcp_create_openreq_child(sk, req, skb);
1316         if (!newsk)
1317                 goto exit;
1318
1319         newsk->sk_gso_type = SKB_GSO_TCPV4;
1320         sk_setup_caps(newsk, dst);
1321
1322         newtp                 = tcp_sk(newsk);
1323         newinet               = inet_sk(newsk);
1324         ireq                  = inet_rsk(req);
1325         newinet->daddr        = ireq->rmt_addr;
1326         newinet->rcv_saddr    = ireq->loc_addr;
1327         newinet->saddr        = ireq->loc_addr;
1328         newinet->opt          = ireq->opt;
1329         ireq->opt             = NULL;
1330         newinet->mc_index     = inet_iif(skb);
1331         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1332         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1333         if (newinet->opt)
1334                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1335         newinet->id = newtp->write_seq ^ jiffies;
1336
1337         tcp_mtup_init(newsk);
1338         tcp_sync_mss(newsk, dst_mtu(dst));
1339         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1340         tcp_initialize_rcv_mss(newsk);
1341
1342 #ifdef CONFIG_TCP_MD5SIG
1343         /* Copy over the MD5 key from the original socket */
1344         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1345                 /*
1346                  * We're using one, so create a matching key
1347                  * on the newsk structure. If we fail to get
1348                  * memory, then we end up not copying the key
1349                  * across. Shucks.
1350                  */
1351                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1352                 if (newkey != NULL)
1353                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1354                                           newkey, key->keylen);
1355         }
1356 #endif
1357
1358         __inet_hash_nolisten(newsk);
1359         __inet_inherit_port(sk, newsk);
1360
1361         return newsk;
1362
1363 exit_overflow:
1364         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1365 exit:
1366         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1367         dst_release(dst);
1368         return NULL;
1369 }
1370
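/* Demultiplex a segment that arrived on a listening socket: check pending
 * connection requests and already established children, falling back to
 * SYN-cookie validation of bare ACKs when enabled.
 */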
1371 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1372 {
1373         struct tcphdr *th = tcp_hdr(skb);
1374         const struct iphdr *iph = ip_hdr(skb);
1375         struct sock *nsk;
1376         struct request_sock **prev;
1377         /* Find possible connection requests. */
1378         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1379                                                        iph->saddr, iph->daddr);
1380         if (req)
1381                 return tcp_check_req(sk, skb, req, prev);
1382
1383         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1384                         th->source, iph->daddr, th->dest, inet_iif(skb));
1385
1386         if (nsk) {
1387                 if (nsk->sk_state != TCP_TIME_WAIT) {
1388                         bh_lock_sock(nsk);
1389                         return nsk;
1390                 }
1391                 inet_twsk_put(inet_twsk(nsk));
1392                 return NULL;
1393         }
1394
1395 #ifdef CONFIG_SYN_COOKIES
1396         if (!th->rst && !th->syn && th->ack)
1397                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1398 #endif
1399         return sk;
1400 }
1401
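/* Validate or defer the TCP checksum on receive: accept a verified
 * CHECKSUM_COMPLETE, fully check short packets now, and otherwise leave
 * the seeded pseudo-header sum to be completed later.
 */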
1402 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1403 {
1404         const struct iphdr *iph = ip_hdr(skb);
1405
1406         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1407                 if (!tcp_v4_check(skb->len, iph->saddr,
1408                                   iph->daddr, skb->csum)) {
1409                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1410                         return 0;
1411                 }
1412         }
1413
1414         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1415                                        skb->len, IPPROTO_TCP, 0);
1416
1417         if (skb->len <= 76) {
1418                 return __skb_checksum_complete(skb);
1419         }
1420         return 0;
1421 }
1422
1423
 1424 /* The socket must have its spinlock held when we get
1425  * here.
1426  *
1427  * We have a potential double-lock case here, so even when
1428  * doing backlog processing we use the BH locking scheme.
1429  * This is because we cannot sleep with the original spinlock
1430  * held.
1431  */
1432 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1433 {
1434         struct sock *rsk;
1435 #ifdef CONFIG_TCP_MD5SIG
1436         /*
1437          * We really want to reject the packet as early as possible
1438          * if:
 1439          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1440          *  o There is an MD5 option and we're not expecting one
1441          */
1442         if (tcp_v4_inbound_md5_hash(sk, skb))
1443                 goto discard;
1444 #endif
1445
1446         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1447                 TCP_CHECK_TIMER(sk);
1448                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1449                         rsk = sk;
1450                         goto reset;
1451                 }
1452                 TCP_CHECK_TIMER(sk);
1453                 return 0;
1454         }
1455
1456         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1457                 goto csum_err;
1458
1459         if (sk->sk_state == TCP_LISTEN) {
1460                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1461                 if (!nsk)
1462                         goto discard;
1463
1464                 if (nsk != sk) {
1465                         if (tcp_child_process(sk, nsk, skb)) {
1466                                 rsk = nsk;
1467                                 goto reset;
1468                         }
1469                         return 0;
1470                 }
1471         }
1472
1473         TCP_CHECK_TIMER(sk);
1474         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1475                 rsk = sk;
1476                 goto reset;
1477         }
1478         TCP_CHECK_TIMER(sk);
1479         return 0;
1480
1481 reset:
1482         tcp_v4_send_reset(rsk, skb);
1483 discard:
1484         kfree_skb(skb);
1485         /* Be careful here. If this function gets more complicated and
1486          * gcc suffers from register pressure on the x86, sk (in %ebx)
1487          * might be destroyed here. This current version compiles correctly,
1488          * but you have been warned.
1489          */
1490         return 0;
1491
1492 csum_err:
1493         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1494         goto discard;
1495 }
1496
1497 /*
1498  *      From tcp_input.c
1499  */
1500
1501 int tcp_v4_rcv(struct sk_buff *skb)
1502 {
1503         const struct iphdr *iph;
1504         struct tcphdr *th;
1505         struct sock *sk;
1506         int ret;
1507
1508         if (skb->pkt_type != PACKET_HOST)
1509                 goto discard_it;
1510
1511         /* Count it even if it's bad */
1512         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1513
1514         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1515                 goto discard_it;
1516
1517         th = tcp_hdr(skb);
1518
1519         if (th->doff < sizeof(struct tcphdr) / 4)
1520                 goto bad_packet;
1521         if (!pskb_may_pull(skb, th->doff * 4))
1522                 goto discard_it;
1523
1524         /* An explanation is required here, I think.
1525          * Packet length and doff are validated by header prediction,
1526          * provided the th->doff == 0 case has been eliminated above.
1527          * So, we defer the checks. */
1528         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1529                 goto bad_packet;
1530
1531         th = tcp_hdr(skb);
1532         iph = ip_hdr(skb);
1533         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1534         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1535                                     skb->len - th->doff * 4);
1536         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1537         TCP_SKB_CB(skb)->when    = 0;
1538         TCP_SKB_CB(skb)->flags   = iph->tos;
1539         TCP_SKB_CB(skb)->sacked  = 0;
1540
1541         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1542                         th->source, iph->daddr, th->dest, inet_iif(skb));
1543         if (!sk)
1544                 goto no_tcp_socket;
1545
1546 process:
1547         if (sk->sk_state == TCP_TIME_WAIT)
1548                 goto do_time_wait;
1549
1550         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1551                 goto discard_and_relse;
1552         nf_reset(skb);
1553
1554         if (sk_filter(sk, skb))
1555                 goto discard_and_relse;
1556
1557         skb->dev = NULL;
1558
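             /* If no user context owns the socket, handle the segment now
              * (possibly via the prequeue); otherwise queue it on the
              * backlog for the owner to process when it releases the lock.
              */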
1559         bh_lock_sock_nested(sk);
1560         ret = 0;
1561         if (!sock_owned_by_user(sk)) {
1562 #ifdef CONFIG_NET_DMA
1563                 struct tcp_sock *tp = tcp_sk(sk);
1564                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1565                         tp->ucopy.dma_chan = get_softnet_dma();
1566                 if (tp->ucopy.dma_chan)
1567                         ret = tcp_v4_do_rcv(sk, skb);
1568                 else
1569 #endif
1570                 {
1571                         if (!tcp_prequeue(sk, skb))
1572                                 ret = tcp_v4_do_rcv(sk, skb);
1573                 }
1574         } else
1575                 sk_add_backlog(sk, skb);
1576         bh_unlock_sock(sk);
1577
1578         sock_put(sk);
1579
1580         return ret;
1581
1582 no_tcp_socket:
1583         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1584                 goto discard_it;
1585
1586         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1587 bad_packet:
1588                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1589         } else {
1590                 tcp_v4_send_reset(NULL, skb);
1591         }
1592
1593 discard_it:
1594         /* Discard frame. */
1595         kfree_skb(skb);
1596         return 0;
1597
1598 discard_and_relse:
1599         sock_put(sk);
1600         goto discard_it;
1601
1602 do_time_wait:
1603         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1604                 inet_twsk_put(inet_twsk(sk));
1605                 goto discard_it;
1606         }
1607
1608         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1609                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1610                 inet_twsk_put(inet_twsk(sk));
1611                 goto discard_it;
1612         }
1613         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1614         case TCP_TW_SYN: {
1615                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1616                                                         &tcp_hashinfo,
1617                                                         iph->daddr, th->dest,
1618                                                         inet_iif(skb));
1619                 if (sk2) {
1620                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1621                         inet_twsk_put(inet_twsk(sk));
1622                         sk = sk2;
1623                         goto process;
1624                 }
1625                 /* Fall through to ACK */
1626         }
1627         case TCP_TW_ACK:
1628                 tcp_v4_timewait_ack(sk, skb);
1629                 break;
1630         case TCP_TW_RST:
1631                 goto no_tcp_socket;
1632         case TCP_TW_SUCCESS:;
1633         }
1634         goto discard_it;
1635 }
1636
1637 /* VJ's idea. Save the last timestamp seen from this destination and hold it
1638  * for at least the normal timewait interval, so that it can be used for
1639  * duplicate segment detection in subsequent connections before they enter
1640  * the synchronized state.
1641  */
1642
1643 int tcp_v4_remember_stamp(struct sock *sk)
1644 {
1645         struct inet_sock *inet = inet_sk(sk);
1646         struct tcp_sock *tp = tcp_sk(sk);
1647         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1648         struct inet_peer *peer = NULL;
1649         int release_it = 0;
1650
1651         if (!rt || rt->rt_dst != inet->daddr) {
1652                 peer = inet_getpeer(inet->daddr, 1);
1653                 release_it = 1;
1654         } else {
1655                 if (!rt->peer)
1656                         rt_bind_peer(rt, 1);
1657                 peer = rt->peer;
1658         }
1659
1660         if (peer) {
1661                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1662                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1663                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1664                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1665                         peer->tcp_ts = tp->rx_opt.ts_recent;
1666                 }
1667                 if (release_it)
1668                         inet_putpeer(peer);
1669                 return 1;
1670         }
1671
1672         return 0;
1673 }
1674
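     /* Same as above, but the cached timestamps come from a TIME_WAIT sock. */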
1675 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1676 {
1677         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1678
1679         if (peer) {
1680                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1681
1682                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1683                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1684                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1685                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1686                         peer->tcp_ts       = tcptw->tw_ts_recent;
1687                 }
1688                 inet_putpeer(peer);
1689                 return 1;
1690         }
1691
1692         return 0;
1693 }
1694
1695 struct inet_connection_sock_af_ops ipv4_specific = {
1696         .queue_xmit        = ip_queue_xmit,
1697         .send_check        = tcp_v4_send_check,
1698         .rebuild_header    = inet_sk_rebuild_header,
1699         .conn_request      = tcp_v4_conn_request,
1700         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1701         .remember_stamp    = tcp_v4_remember_stamp,
1702         .net_header_len    = sizeof(struct iphdr),
1703         .setsockopt        = ip_setsockopt,
1704         .getsockopt        = ip_getsockopt,
1705         .addr2sockaddr     = inet_csk_addr2sockaddr,
1706         .sockaddr_len      = sizeof(struct sockaddr_in),
1707         .bind_conflict     = inet_csk_bind_conflict,
1708 #ifdef CONFIG_COMPAT
1709         .compat_setsockopt = compat_ip_setsockopt,
1710         .compat_getsockopt = compat_ip_getsockopt,
1711 #endif
1712 };
1713
1714 #ifdef CONFIG_TCP_MD5SIG
1715 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1716         .md5_lookup             = tcp_v4_md5_lookup,
1717         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1718         .md5_add                = tcp_v4_md5_add_func,
1719         .md5_parse              = tcp_v4_parse_md5_keys,
1720 };
1721 #endif
1722
1723 /* NOTE: A lot of things are set to zero explicitly by the call to
1724  *       sk_alloc(), so they need not be done here.
1725  */
1726 static int tcp_v4_init_sock(struct sock *sk)
1727 {
1728         struct inet_connection_sock *icsk = inet_csk(sk);
1729         struct tcp_sock *tp = tcp_sk(sk);
1730
1731         skb_queue_head_init(&tp->out_of_order_queue);
1732         tcp_init_xmit_timers(sk);
1733         tcp_prequeue_init(tp);
1734
1735         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1736         tp->mdev = TCP_TIMEOUT_INIT;
1737
1738         /* So many TCP implementations out there (incorrectly) count the
1739          * initial SYN frame in their delayed-ACK and congestion control
1740          * algorithms that we must have the following bandaid to talk
1741          * efficiently to them.  -DaveM
1742          */
1743         tp->snd_cwnd = 2;
1744
1745         /* See draft-stevens-tcpca-spec-01 for discussion of the
1746          * initialization of these values.
1747          */
1748         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1749         tp->snd_cwnd_clamp = ~0;
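             /* Conservative RFC 1122 default MSS (536) until the route's MTU
              * is known.
              */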
1750         tp->mss_cache = 536;
1751
1752         tp->reordering = sysctl_tcp_reordering;
1753         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1754
1755         sk->sk_state = TCP_CLOSE;
1756
1757         sk->sk_write_space = sk_stream_write_space;
1758         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1759
1760         icsk->icsk_af_ops = &ipv4_specific;
1761         icsk->icsk_sync_mss = tcp_sync_mss;
1762 #ifdef CONFIG_TCP_MD5SIG
1763         tp->af_specific = &tcp_sock_ipv4_specific;
1764 #endif
1765
1766         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1767         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1768
1769         atomic_inc(&tcp_sockets_allocated);
1770
1771         return 0;
1772 }
1773
1774 void tcp_v4_destroy_sock(struct sock *sk)
1775 {
1776         struct tcp_sock *tp = tcp_sk(sk);
1777
1778         tcp_clear_xmit_timers(sk);
1779
1780         tcp_cleanup_congestion_control(sk);
1781
1782         /* Clean up the write buffer. */
1783         tcp_write_queue_purge(sk);
1784
1785         /* Cleans up our, hopefully empty, out_of_order_queue. */
1786         __skb_queue_purge(&tp->out_of_order_queue);
1787
1788 #ifdef CONFIG_TCP_MD5SIG
1789         /* Clean up the MD5 key list, if any */
1790         if (tp->md5sig_info) {
1791                 tcp_v4_clear_md5_list(sk);
1792                 kfree(tp->md5sig_info);
1793                 tp->md5sig_info = NULL;
1794         }
1795 #endif
1796
1797 #ifdef CONFIG_NET_DMA
1798         /* Cleans up our sk_async_wait_queue */
1799         __skb_queue_purge(&sk->sk_async_wait_queue);
1800 #endif
1801
1802         /* Clean up the prequeue; it really should be empty by now. */
1803         __skb_queue_purge(&tp->ucopy.prequeue);
1804
1805         /* Clean up a referenced TCP bind bucket. */
1806         if (inet_csk(sk)->icsk_bind_hash)
1807                 inet_put_port(sk);
1808
1809         /*
1810          * If a sendmsg cached page exists, toss it.
1811          */
1812         if (sk->sk_sndmsg_page) {
1813                 __free_page(sk->sk_sndmsg_page);
1814                 sk->sk_sndmsg_page = NULL;
1815         }
1816
1817         atomic_dec(&tcp_sockets_allocated);
1818 }
1819
1820 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1821
1822 #ifdef CONFIG_PROC_FS
1823 /* Proc filesystem TCP sock list dumping. */
1824
1825 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1826 {
1827         return hlist_empty(head) ? NULL :
1828                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1829 }
1830
1831 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1832 {
1833         return tw->tw_node.next ?
1834                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1835 }
1836
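     /* Advance the /proc iterator over the listening hash.  For each listener
      * its SYN queue is walked as well (TCP_SEQ_STATE_OPENREQ), so embryonic
      * connections are dumped too.
      */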
1837 static void *listening_get_next(struct seq_file *seq, void *cur)
1838 {
1839         struct inet_connection_sock *icsk;
1840         struct hlist_node *node;
1841         struct sock *sk = cur;
1842         struct tcp_iter_state *st = seq->private;
1843         struct net *net = seq_file_net(seq);
1844
1845         if (!sk) {
1846                 st->bucket = 0;
1847                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1848                 goto get_sk;
1849         }
1850
1851         ++st->num;
1852
1853         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1854                 struct request_sock *req = cur;
1855
1856                 icsk = inet_csk(st->syn_wait_sk);
1857                 req = req->dl_next;
1858                 while (1) {
1859                         while (req) {
1860                                 if (req->rsk_ops->family == st->family &&
1861                                     net_eq(sock_net(req->sk), net)) {
1862                                         cur = req;
1863                                         goto out;
1864                                 }
1865                                 req = req->dl_next;
1866                         }
1867                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1868                                 break;
1869 get_req:
1870                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1871                 }
1872                 sk        = sk_next(st->syn_wait_sk);
1873                 st->state = TCP_SEQ_STATE_LISTENING;
1874                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1875         } else {
1876                 icsk = inet_csk(sk);
1877                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1878                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1879                         goto start_req;
1880                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1881                 sk = sk_next(sk);
1882         }
1883 get_sk:
1884         sk_for_each_from(sk, node) {
1885                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1886                         cur = sk;
1887                         goto out;
1888                 }
1889                 icsk = inet_csk(sk);
1890                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1891                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1892 start_req:
1893                         st->uid         = sock_i_uid(sk);
1894                         st->syn_wait_sk = sk;
1895                         st->state       = TCP_SEQ_STATE_OPENREQ;
1896                         st->sbucket     = 0;
1897                         goto get_req;
1898                 }
1899                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1900         }
1901         if (++st->bucket < INET_LHTABLE_SIZE) {
1902                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1903                 goto get_sk;
1904         }
1905         cur = NULL;
1906 out:
1907         return cur;
1908 }
1909
1910 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1911 {
1912         void *rc = listening_get_next(seq, NULL);
1913
1914         while (rc && *pos) {
1915                 rc = listening_get_next(seq, rc);
1916                 --*pos;
1917         }
1918         return rc;
1919 }
1920
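     /* Find the first entry in the established hash: the regular sockets on a
      * bucket's chain are visited before the TIME_WAIT sockets on its twchain.
      */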
1921 static void *established_get_first(struct seq_file *seq)
1922 {
1923         struct tcp_iter_state *st = seq->private;
1924         struct net *net = seq_file_net(seq);
1925         void *rc = NULL;
1926
1927         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1928                 struct sock *sk;
1929                 struct hlist_node *node;
1930                 struct inet_timewait_sock *tw;
1931                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1932
1933                 read_lock_bh(lock);
1934                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1935                         if (sk->sk_family != st->family ||
1936                             !net_eq(sock_net(sk), net)) {
1937                                 continue;
1938                         }
1939                         rc = sk;
1940                         goto out;
1941                 }
1942                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1943                 inet_twsk_for_each(tw, node,
1944                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1945                         if (tw->tw_family != st->family ||
1946                             !net_eq(twsk_net(tw), net)) {
1947                                 continue;
1948                         }
1949                         rc = tw;
1950                         goto out;
1951                 }
1952                 read_unlock_bh(lock);
1953                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1954         }
1955 out:
1956         return rc;
1957 }
1958
1959 static void *established_get_next(struct seq_file *seq, void *cur)
1960 {
1961         struct sock *sk = cur;
1962         struct inet_timewait_sock *tw;
1963         struct hlist_node *node;
1964         struct tcp_iter_state *st = seq->private;
1965         struct net *net = seq_file_net(seq);
1966
1967         ++st->num;
1968
1969         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1970                 tw = cur;
1971                 tw = tw_next(tw);
1972 get_tw:
1973                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1974                         tw = tw_next(tw);
1975                 }
1976                 if (tw) {
1977                         cur = tw;
1978                         goto out;
1979                 }
1980                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1981                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1982
1983                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1984                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1985                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1986                 } else {
1987                         cur = NULL;
1988                         goto out;
1989                 }
1990         } else
1991                 sk = sk_next(sk);
1992
1993         sk_for_each_from(sk, node) {
1994                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1995                         goto found;
1996         }
1997
1998         st->state = TCP_SEQ_STATE_TIME_WAIT;
1999         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2000         goto get_tw;
2001 found:
2002         cur = sk;
2003 out:
2004         return cur;
2005 }
2006
2007 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2008 {
2009         void *rc = established_get_first(seq);
2010
2011         while (rc && pos) {
2012                 rc = established_get_next(seq, rc);
2013                 --pos;
2014         }
2015         return rc;
2016 }
2017
2018 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2019 {
2020         void *rc;
2021         struct tcp_iter_state *st = seq->private;
2022
2023         inet_listen_lock(&tcp_hashinfo);
2024         st->state = TCP_SEQ_STATE_LISTENING;
2025         rc        = listening_get_idx(seq, &pos);
2026
2027         if (!rc) {
2028                 inet_listen_unlock(&tcp_hashinfo);
2029                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2030                 rc        = established_get_idx(seq, pos);
2031         }
2032
2033         return rc;
2034 }
2035
2036 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2037 {
2038         struct tcp_iter_state *st = seq->private;
2039         st->state = TCP_SEQ_STATE_LISTENING;
2040         st->num = 0;
2041         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2042 }
2043
2044 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2045 {
2046         void *rc = NULL;
2047         struct tcp_iter_state *st;
2048
2049         if (v == SEQ_START_TOKEN) {
2050                 rc = tcp_get_idx(seq, 0);
2051                 goto out;
2052         }
2053         st = seq->private;
2054
2055         switch (st->state) {
2056         case TCP_SEQ_STATE_OPENREQ:
2057         case TCP_SEQ_STATE_LISTENING:
2058                 rc = listening_get_next(seq, v);
2059                 if (!rc) {
2060                         inet_listen_unlock(&tcp_hashinfo);
2061                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2062                         rc        = established_get_first(seq);
2063                 }
2064                 break;
2065         case TCP_SEQ_STATE_ESTABLISHED:
2066         case TCP_SEQ_STATE_TIME_WAIT:
2067                 rc = established_get_next(seq, v);
2068                 break;
2069         }
2070 out:
2071         ++*pos;
2072         return rc;
2073 }
2074
2075 static void tcp_seq_stop(struct seq_file *seq, void *v)
2076 {
2077         struct tcp_iter_state *st = seq->private;
2078
2079         switch (st->state) {
2080         case TCP_SEQ_STATE_OPENREQ:
2081                 if (v) {
2082                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2083                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2084                 }
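                     /* fall through: the listening hash lock must be released too */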
2085         case TCP_SEQ_STATE_LISTENING:
2086                 if (v != SEQ_START_TOKEN)
2087                         inet_listen_unlock(&tcp_hashinfo);
2088                 break;
2089         case TCP_SEQ_STATE_TIME_WAIT:
2090         case TCP_SEQ_STATE_ESTABLISHED:
2091                 if (v)
2092                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2093                 break;
2094         }
2095 }
2096
2097 static int tcp_seq_open(struct inode *inode, struct file *file)
2098 {
2099         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2100         struct tcp_iter_state *s;
2101         int err;
2102
2103         err = seq_open_net(inode, file, &afinfo->seq_ops,
2104                           sizeof(struct tcp_iter_state));
2105         if (err < 0)
2106                 return err;
2107
2108         s = ((struct seq_file *)file->private_data)->private;
2109         s->family               = afinfo->family;
2110         return 0;
2111 }
2112
2113 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2114 {
2115         int rc = 0;
2116         struct proc_dir_entry *p;
2117
2118         afinfo->seq_fops.open           = tcp_seq_open;
2119         afinfo->seq_fops.read           = seq_read;
2120         afinfo->seq_fops.llseek         = seq_lseek;
2121         afinfo->seq_fops.release        = seq_release_net;
2122
2123         afinfo->seq_ops.start           = tcp_seq_start;
2124         afinfo->seq_ops.next            = tcp_seq_next;
2125         afinfo->seq_ops.stop            = tcp_seq_stop;
2126
2127         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2128                              &afinfo->seq_fops, afinfo);
2129         if (!p)
2130                 rc = -ENOMEM;
2131         return rc;
2132 }
2133
2134 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2135 {
2136         proc_net_remove(net, afinfo->name);
2137 }
2138
2139 static void get_openreq4(struct sock *sk, struct request_sock *req,
2140                          struct seq_file *f, int i, int uid, int *len)
2141 {
2142         const struct inet_request_sock *ireq = inet_rsk(req);
2143         int ttd = req->expires - jiffies;
2144
2145         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2146                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2147                 i,
2148                 ireq->loc_addr,
2149                 ntohs(inet_sk(sk)->sport),
2150                 ireq->rmt_addr,
2151                 ntohs(ireq->rmt_port),
2152                 TCP_SYN_RECV,
2153                 0, 0, /* could print option size, but that is af dependent. */
2154                 1,    /* timers active (only the expire timer) */
2155                 jiffies_to_clock_t(ttd),
2156                 req->retrans,
2157                 uid,
2158                 0,  /* non standard timer */
2159                 0, /* open_requests have no inode */
2160                 atomic_read(&sk->sk_refcnt),
2161                 req,
2162                 len);
2163 }
2164
2165 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2166 {
2167         int timer_active;
2168         unsigned long timer_expires;
2169         struct tcp_sock *tp = tcp_sk(sk);
2170         const struct inet_connection_sock *icsk = inet_csk(sk);
2171         struct inet_sock *inet = inet_sk(sk);
2172         __be32 dest = inet->daddr;
2173         __be32 src = inet->rcv_saddr;
2174         __u16 destp = ntohs(inet->dport);
2175         __u16 srcp = ntohs(inet->sport);
2176
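             /* Pick which timer to report in the "tr" column of the dump:
              * 1 retransmit, 4 zero-window probe, 2 keepalive/sk_timer,
              * 0 none.
              */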
2177         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2178                 timer_active    = 1;
2179                 timer_expires   = icsk->icsk_timeout;
2180         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2181                 timer_active    = 4;
2182                 timer_expires   = icsk->icsk_timeout;
2183         } else if (timer_pending(&sk->sk_timer)) {
2184                 timer_active    = 2;
2185                 timer_expires   = sk->sk_timer.expires;
2186         } else {
2187                 timer_active    = 0;
2188                 timer_expires = jiffies;
2189         }
2190
2191         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2192                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2193                 i, src, srcp, dest, destp, sk->sk_state,
2194                 tp->write_seq - tp->snd_una,
2195                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2196                                              (tp->rcv_nxt - tp->copied_seq),
2197                 timer_active,
2198                 jiffies_to_clock_t(timer_expires - jiffies),
2199                 icsk->icsk_retransmits,
2200                 sock_i_uid(sk),
2201                 icsk->icsk_probes_out,
2202                 sock_i_ino(sk),
2203                 atomic_read(&sk->sk_refcnt), sk,
2204                 jiffies_to_clock_t(icsk->icsk_rto),
2205                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2206                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2207                 tp->snd_cwnd,
2208                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2209                 len);
2210 }
2211
2212 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2213                                struct seq_file *f, int i, int *len)
2214 {
2215         __be32 dest, src;
2216         __u16 destp, srcp;
2217         int ttd = tw->tw_ttd - jiffies;
2218
2219         if (ttd < 0)
2220                 ttd = 0;
2221
2222         dest  = tw->tw_daddr;
2223         src   = tw->tw_rcv_saddr;
2224         destp = ntohs(tw->tw_dport);
2225         srcp  = ntohs(tw->tw_sport);
2226
2227         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2228                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2229                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2230                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2231                 atomic_read(&tw->tw_refcnt), tw, len);
2232 }
2233
2234 #define TMPSZ 150
2235
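     /* Each record is padded to TMPSZ - 1 characters; the helpers above
      * report how many bytes they wrote via their trailing %n argument.
      */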
2236 static int tcp4_seq_show(struct seq_file *seq, void *v)
2237 {
2238         struct tcp_iter_state *st;
2239         int len;
2240
2241         if (v == SEQ_START_TOKEN) {
2242                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2243                            "  sl  local_address rem_address   st tx_queue "
2244                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2245                            "inode");
2246                 goto out;
2247         }
2248         st = seq->private;
2249
2250         switch (st->state) {
2251         case TCP_SEQ_STATE_LISTENING:
2252         case TCP_SEQ_STATE_ESTABLISHED:
2253                 get_tcp4_sock(v, seq, st->num, &len);
2254                 break;
2255         case TCP_SEQ_STATE_OPENREQ:
2256                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2257                 break;
2258         case TCP_SEQ_STATE_TIME_WAIT:
2259                 get_timewait4_sock(v, seq, st->num, &len);
2260                 break;
2261         }
2262         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2263 out:
2264         return 0;
2265 }
2266
2267 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2268         .name           = "tcp",
2269         .family         = AF_INET,
2270         .seq_fops       = {
2271                 .owner          = THIS_MODULE,
2272         },
2273         .seq_ops        = {
2274                 .show           = tcp4_seq_show,
2275         },
2276 };
2277
2278 static int tcp4_proc_init_net(struct net *net)
2279 {
2280         return tcp_proc_register(net, &tcp4_seq_afinfo);
2281 }
2282
2283 static void tcp4_proc_exit_net(struct net *net)
2284 {
2285         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2286 }
2287
2288 static struct pernet_operations tcp4_net_ops = {
2289         .init = tcp4_proc_init_net,
2290         .exit = tcp4_proc_exit_net,
2291 };
2292
2293 int __init tcp4_proc_init(void)
2294 {
2295         return register_pernet_subsys(&tcp4_net_ops);
2296 }
2297
2298 void tcp4_proc_exit(void)
2299 {
2300         unregister_pernet_subsys(&tcp4_net_ops);
2301 }
2302 #endif /* CONFIG_PROC_FS */
2303
2304 struct proto tcp_prot = {
2305         .name                   = "TCP",
2306         .owner                  = THIS_MODULE,
2307         .close                  = tcp_close,
2308         .connect                = tcp_v4_connect,
2309         .disconnect             = tcp_disconnect,
2310         .accept                 = inet_csk_accept,
2311         .ioctl                  = tcp_ioctl,
2312         .init                   = tcp_v4_init_sock,
2313         .destroy                = tcp_v4_destroy_sock,
2314         .shutdown               = tcp_shutdown,
2315         .setsockopt             = tcp_setsockopt,
2316         .getsockopt             = tcp_getsockopt,
2317         .recvmsg                = tcp_recvmsg,
2318         .backlog_rcv            = tcp_v4_do_rcv,
2319         .hash                   = inet_hash,
2320         .unhash                 = inet_unhash,
2321         .get_port               = inet_csk_get_port,
2322         .enter_memory_pressure  = tcp_enter_memory_pressure,
2323         .sockets_allocated      = &tcp_sockets_allocated,
2324         .orphan_count           = &tcp_orphan_count,
2325         .memory_allocated       = &tcp_memory_allocated,
2326         .memory_pressure        = &tcp_memory_pressure,
2327         .sysctl_mem             = sysctl_tcp_mem,
2328         .sysctl_wmem            = sysctl_tcp_wmem,
2329         .sysctl_rmem            = sysctl_tcp_rmem,
2330         .max_header             = MAX_TCP_HEADER,
2331         .obj_size               = sizeof(struct tcp_sock),
2332         .twsk_prot              = &tcp_timewait_sock_ops,
2333         .rsk_prot               = &tcp_request_sock_ops,
2334         .h.hashinfo             = &tcp_hashinfo,
2335 #ifdef CONFIG_COMPAT
2336         .compat_setsockopt      = compat_tcp_setsockopt,
2337         .compat_getsockopt      = compat_tcp_getsockopt,
2338 #endif
2339 };
2340
2341
2342 static int __net_init tcp_sk_init(struct net *net)
2343 {
2344         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2345                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2346 }
2347
2348 static void __net_exit tcp_sk_exit(struct net *net)
2349 {
2350         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2351 }
2352
2353 static struct pernet_operations __net_initdata tcp_sk_ops = {
2354         .init = tcp_sk_init,
2355         .exit = tcp_sk_exit,
2356 };
2357
2358 void __init tcp_v4_init(void)
2359 {
2360         if (register_pernet_device(&tcp_sk_ops))
2361                 panic("Failed to create the TCP control socket.\n");
2362 }
2363
2364 EXPORT_SYMBOL(ipv4_specific);
2365 EXPORT_SYMBOL(tcp_hashinfo);
2366 EXPORT_SYMBOL(tcp_prot);
2367 EXPORT_SYMBOL(tcp_v4_conn_request);
2368 EXPORT_SYMBOL(tcp_v4_connect);
2369 EXPORT_SYMBOL(tcp_v4_do_rcv);
2370 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2371 EXPORT_SYMBOL(tcp_v4_send_check);
2372 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2373
2374 #ifdef CONFIG_PROC_FS
2375 EXPORT_SYMBOL(tcp_proc_register);
2376 EXPORT_SYMBOL(tcp_proc_unregister);
2377 #endif
2378 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2379