dccp: Insert feature-negotiation options into skb
[linux-2.6] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
/* Tunable read by tcp_twsk_unique(): when non-zero, a TIME-WAIT bucket whose
 * last timestamp is more than one second old may be taken over. */
83 int sysctl_tcp_tw_reuse __read_mostly;
/* Tunable; not referenced in the visible part of this file. */
84 int sysctl_tcp_low_latency __read_mostly;
85
86
/*
 * Forward declarations for the MD5 signature helpers used before their
 * definitions.  Without CONFIG_TCP_MD5SIG the lookup collapses to an inline
 * stub that always returns NULL, so callers need no #ifdef of their own.
 */
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91                                __be32 daddr, __be32 saddr, struct tcphdr *th);
92 #else
93 static inline
94 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95 {
96         return NULL;
97 }
98 #endif
99
/*
 * Hash-table state for TCP sockets; handed to inet_lookup() by tcp_v4_err().
 * Only the lhash_* members (lock, user count, wait queue) are initialised
 * statically here; the remaining members start out zeroed.
 */
100 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102         .lhash_users = ATOMIC_INIT(0),
103         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104 };
105
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109                                           ip_hdr(skb)->saddr,
110                                           tcp_hdr(skb)->dest,
111                                           tcp_hdr(skb)->source);
112 }
113
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117         struct tcp_sock *tp = tcp_sk(sk);
118
119         /* With PAWS, it is safe from the viewpoint
120            of data integrity. Even without PAWS it is safe provided sequence
121            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123            Actually, the idea is close to VJ's one, only timestamp cache is
124            held not per host, but per port pair and TW bucket is used as state
125            holder.
126
127            If TW bucket has been already destroyed we fall back to VJ's scheme
128            and use initial timestamp retrieved from peer table.
129          */
130         if (tcptw->tw_ts_recent_stamp &&
131             (twp == NULL || (sysctl_tcp_tw_reuse &&
132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134                 if (tp->write_seq == 0)
135                         tp->write_seq = 1;
136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138                 sock_hold(sktw);
139                 return 1;
140         }
141
142         return 0;
143 }
144
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
147 /* This will initiate an outgoing connection. */
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150         struct inet_sock *inet = inet_sk(sk);
151         struct tcp_sock *tp = tcp_sk(sk);
152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153         struct rtable *rt;
154         __be32 daddr, nexthop;
155         int tmp;
156         int err;
157
158         if (addr_len < sizeof(struct sockaddr_in))
159                 return -EINVAL;
160
161         if (usin->sin_family != AF_INET)
162                 return -EAFNOSUPPORT;
163
164         nexthop = daddr = usin->sin_addr.s_addr;
165         if (inet->opt && inet->opt->srr) {
166                 if (!daddr)
167                         return -EINVAL;
168                 nexthop = inet->opt->faddr;
169         }
170
171         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                                IPPROTO_TCP,
174                                inet->sport, usin->sin_port, sk, 1);
175         if (tmp < 0) {
176                 if (tmp == -ENETUNREACH)
177                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return tmp;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet->opt || !inet->opt->srr)
187                 daddr = rt->rt_dst;
188
189         if (!inet->saddr)
190                 inet->saddr = rt->rt_src;
191         inet->rcv_saddr = inet->saddr;
192
193         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 tp->write_seq              = 0;
198         }
199
200         if (tcp_death_row.sysctl_tw_recycle &&
201             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202                 struct inet_peer *peer = rt_get_peer(rt);
203                 /*
204                  * VJ's idea. We save last timestamp seen from
205                  * the destination in peer table, when entering state
206                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
207                  * when trying new connection.
208                  */
209                 if (peer != NULL &&
210                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212                         tp->rx_opt.ts_recent = peer->tcp_ts;
213                 }
214         }
215
216         inet->dport = usin->sin_port;
217         inet->daddr = daddr;
218
219         inet_csk(sk)->icsk_ext_hdr_len = 0;
220         if (inet->opt)
221                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
223         tp->rx_opt.mss_clamp = 536;
224
225         /* Socket identity is still unknown (sport may be zero).
226          * However we set state to SYN-SENT and not releasing socket
227          * lock select source port, enter ourselves into the hash tables and
228          * complete initialization after this.
229          */
230         tcp_set_state(sk, TCP_SYN_SENT);
231         err = inet_hash_connect(&tcp_death_row, sk);
232         if (err)
233                 goto failure;
234
235         err = ip_route_newports(&rt, IPPROTO_TCP,
236                                 inet->sport, inet->dport, sk);
237         if (err)
238                 goto failure;
239
240         /* OK, now commit destination to socket.  */
241         sk->sk_gso_type = SKB_GSO_TCPV4;
242         sk_setup_caps(sk, &rt->u.dst);
243
244         if (!tp->write_seq)
245                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246                                                            inet->daddr,
247                                                            inet->sport,
248                                                            usin->sin_port);
249
250         inet->id = tp->write_seq ^ jiffies;
251
252         err = tcp_connect(sk);
253         rt = NULL;
254         if (err)
255                 goto failure;
256
257         return 0;
258
259 failure:
260         /*
261          * This unhashes the socket and releases the local port,
262          * if necessary.
263          */
264         tcp_set_state(sk, TCP_CLOSE);
265         ip_rt_put(rt);
266         sk->sk_route_caps = 0;
267         inet->dport = 0;
268         return err;
269 }
270
271 /*
272  * This routine does path mtu discovery as defined in RFC1191.
273  */
274 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275 {
276         struct dst_entry *dst;
277         struct inet_sock *inet = inet_sk(sk);
278
279         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280          * send out by Linux are always <576bytes so they should go through
281          * unfragmented).
282          */
283         if (sk->sk_state == TCP_LISTEN)
284                 return;
285
286         /* We don't check in the destentry if pmtu discovery is forbidden
287          * on this route. We just assume that no packet_to_big packets
288          * are send back when pmtu discovery is not active.
289          * There is a small race when the user changes this flag in the
290          * route, but I think that's acceptable.
291          */
292         if ((dst = __sk_dst_check(sk, 0)) == NULL)
293                 return;
294
295         dst->ops->update_pmtu(dst, mtu);
296
297         /* Something is about to be wrong... Remember soft error
298          * for the case, if this connection will not able to recover.
299          */
300         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301                 sk->sk_err_soft = EMSGSIZE;
302
303         mtu = dst_mtu(dst);
304
305         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307                 tcp_sync_mss(sk, mtu);
308
309                 /* Resend the TCP packet because it's
310                  * clear that the old packet has been
311                  * dropped. This is the new "fast" path mtu
312                  * discovery.
313                  */
314                 tcp_simple_retransmit(sk);
315         } /* else let the usual retransmit timer handle it */
316 }
317
318 /*
319  * This routine is called by the ICMP module when it gets some
320  * sort of error condition.  If err < 0 then the socket should
321  * be closed and the error returned to the user.  If err > 0
322  * it's just the icmp type << 8 | icmp code.  After adjustment
323  * header points to the first 8 bytes of the tcp header.  We need
324  * to find the appropriate port.
325  *
326  * The locking strategy used here is very "optimistic". When
327  * someone else accesses the socket the ICMP is just dropped
328  * and for some paths there is no check at all.
329  * A more general error queue to queue errors for later handling
330  * is probably better.
331  *
332  */
333
334 void tcp_v4_err(struct sk_buff *skb, u32 info)
335 {
336         struct iphdr *iph = (struct iphdr *)skb->data;
337         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338         struct tcp_sock *tp;
339         struct inet_sock *inet;
340         const int type = icmp_hdr(skb)->type;
341         const int code = icmp_hdr(skb)->code;
342         struct sock *sk;
343         __u32 seq;
344         int err;
345         struct net *net = dev_net(skb->dev);
346
347         if (skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         tp = tcp_sk(sk);
374         seq = ntohl(th->seq);
375         if (sk->sk_state != TCP_LISTEN &&
376             !between(seq, tp->snd_una, tp->snd_nxt)) {
377                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
378                 goto out;
379         }
380
381         switch (type) {
382         case ICMP_SOURCE_QUENCH:
383                 /* Just silently ignore these. */
384                 goto out;
385         case ICMP_PARAMETERPROB:
386                 err = EPROTO;
387                 break;
388         case ICMP_DEST_UNREACH:
389                 if (code > NR_ICMP_UNREACH)
390                         goto out;
391
392                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393                         if (!sock_owned_by_user(sk))
394                                 do_pmtu_discovery(sk, iph, info);
395                         goto out;
396                 }
397
398                 err = icmp_err_convert[code].errno;
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 err = EHOSTUNREACH;
402                 break;
403         default:
404                 goto out;
405         }
406
407         switch (sk->sk_state) {
408                 struct request_sock *req, **prev;
409         case TCP_LISTEN:
410                 if (sock_owned_by_user(sk))
411                         goto out;
412
413                 req = inet_csk_search_req(sk, &prev, th->dest,
414                                           iph->daddr, iph->saddr);
415                 if (!req)
416                         goto out;
417
418                 /* ICMPs are not backlogged, hence we cannot get
419                    an established socket here.
420                  */
421                 WARN_ON(req->sk);
422
423                 if (seq != tcp_rsk(req)->snt_isn) {
424                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
425                         goto out;
426                 }
427
428                 /*
429                  * Still in SYN_RECV, just remove it silently.
430                  * There is no good way to pass the error to the newly
431                  * created socket, and POSIX does not want network
432                  * errors returned from accept().
433                  */
434                 inet_csk_reqsk_queue_drop(sk, req, prev);
435                 goto out;
436
437         case TCP_SYN_SENT:
438         case TCP_SYN_RECV:  /* Cannot happen.
439                                It can f.e. if SYNs crossed.
440                              */
441                 if (!sock_owned_by_user(sk)) {
442                         sk->sk_err = err;
443
444                         sk->sk_error_report(sk);
445
446                         tcp_done(sk);
447                 } else {
448                         sk->sk_err_soft = err;
449                 }
450                 goto out;
451         }
452
453         /* If we've already connected we will keep trying
454          * until we time out, or the user gives up.
455          *
456          * rfc1122 4.2.3.9 allows to consider as hard errors
457          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
458          * but it is obsoleted by pmtu discovery).
459          *
460          * Note, that in modern internet, where routing is unreliable
461          * and in each dark corner broken firewalls sit, sending random
462          * errors ordered by their masters even this two messages finally lose
463          * their original sense (even Linux sends invalid PORT_UNREACHs)
464          *
465          * Now we are in compliance with RFCs.
466          *                                                      --ANK (980905)
467          */
468
469         inet = inet_sk(sk);
470         if (!sock_owned_by_user(sk) && inet->recverr) {
471                 sk->sk_err = err;
472                 sk->sk_error_report(sk);
473         } else  { /* Only an error on timeout */
474                 sk->sk_err_soft = err;
475         }
476
477 out:
478         bh_unlock_sock(sk);
479         sock_put(sk);
480 }
481
482 /* This routine computes an IPv4 TCP checksum. */
483 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484 {
485         struct inet_sock *inet = inet_sk(sk);
486         struct tcphdr *th = tcp_hdr(skb);
487
488         if (skb->ip_summed == CHECKSUM_PARTIAL) {
489                 th->check = ~tcp_v4_check(len, inet->saddr,
490                                           inet->daddr, 0);
491                 skb->csum_start = skb_transport_header(skb) - skb->head;
492                 skb->csum_offset = offsetof(struct tcphdr, check);
493         } else {
494                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495                                          csum_partial((char *)th,
496                                                       th->doff << 2,
497                                                       skb->csum));
498         }
499 }
500
501 int tcp_v4_gso_send_check(struct sk_buff *skb)
502 {
503         const struct iphdr *iph;
504         struct tcphdr *th;
505
506         if (!pskb_may_pull(skb, sizeof(*th)))
507                 return -EINVAL;
508
509         iph = ip_hdr(skb);
510         th = tcp_hdr(skb);
511
512         th->check = 0;
513         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514         skb->csum_start = skb_transport_header(skb) - skb->head;
515         skb->csum_offset = offsetof(struct tcphdr, check);
516         skb->ip_summed = CHECKSUM_PARTIAL;
517         return 0;
518 }
519
520 /*
521  *      This routine will send an RST to the other tcp.
522  *
523  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
524  *                    for reset.
525  *      Answer: if a packet caused RST, it is not for a socket
526  *              existing in our system, if it is matched to a socket,
527  *              it is just duplicate segment or bug in other side's TCP.
528  *              So that we build reply only basing on parameters
529  *              arrived with segment.
530  *      Exception: precedence violation. We do not implement it in any case.
531  */
532
533 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534 {
535         struct tcphdr *th = tcp_hdr(skb);
536         struct {
537                 struct tcphdr th;
538 #ifdef CONFIG_TCP_MD5SIG
539                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540 #endif
541         } rep;
542         struct ip_reply_arg arg;
543 #ifdef CONFIG_TCP_MD5SIG
544         struct tcp_md5sig_key *key;
545 #endif
546         struct net *net;
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586                                      key, ip_hdr(skb)->daddr,
587                                      ip_hdr(skb)->saddr, &rep.th);
588         }
589 #endif
590         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591                                       ip_hdr(skb)->saddr, /* XXX */
592                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
593         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594
595         net = dev_net(skb->dst->dev);
596         ip_send_reply(net->ipv4.tcp_sock, skb,
597                       &arg, arg.iov[0].iov_len);
598
599         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
600         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
601 }
602
603 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
604    outside of socket context, is certainly ugly. What can I do?
605  */
606
607 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608                             u32 win, u32 ts, int oif,
609                             struct tcp_md5sig_key *key)
610 {
611         struct tcphdr *th = tcp_hdr(skb);
612         struct {
613                 struct tcphdr th;
614                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615 #ifdef CONFIG_TCP_MD5SIG
616                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617 #endif
618                         ];
619         } rep;
620         struct ip_reply_arg arg;
621         struct net *net = dev_net(skb->dev);
622
623         memset(&rep.th, 0, sizeof(struct tcphdr));
624         memset(&arg, 0, sizeof(arg));
625
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628         if (ts) {
629                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
630                                    (TCPOPT_TIMESTAMP << 8) |
631                                    TCPOLEN_TIMESTAMP);
632                 rep.opt[1] = htonl(tcp_time_stamp);
633                 rep.opt[2] = htonl(ts);
634                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
635         }
636
637         /* Swap the send and the receive. */
638         rep.th.dest    = th->source;
639         rep.th.source  = th->dest;
640         rep.th.doff    = arg.iov[0].iov_len / 4;
641         rep.th.seq     = htonl(seq);
642         rep.th.ack_seq = htonl(ack);
643         rep.th.ack     = 1;
644         rep.th.window  = htons(win);
645
646 #ifdef CONFIG_TCP_MD5SIG
647         if (key) {
648                 int offset = (ts) ? 3 : 0;
649
650                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
651                                           (TCPOPT_NOP << 16) |
652                                           (TCPOPT_MD5SIG << 8) |
653                                           TCPOLEN_MD5SIG);
654                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
655                 rep.th.doff = arg.iov[0].iov_len/4;
656
657                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
658                                     key, ip_hdr(skb)->saddr,
659                                     ip_hdr(skb)->daddr, &rep.th);
660         }
661 #endif
662         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663                                       ip_hdr(skb)->saddr, /* XXX */
664                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
665         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666         if (oif)
667                 arg.bound_dev_if = oif;
668
669         ip_send_reply(net->ipv4.tcp_sock, skb,
670                       &arg, arg.iov[0].iov_len);
671
672         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673 }
674
675 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676 {
677         struct inet_timewait_sock *tw = inet_twsk(sk);
678         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682                         tcptw->tw_ts_recent,
683                         tw->tw_bound_dev_if,
684                         tcp_twsk_md5_key(tcptw)
685                         );
686
687         inet_twsk_put(tw);
688 }
689
690 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
691                                   struct request_sock *req)
692 {
693         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695                         req->ts_recent,
696                         0,
697                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr));
698 }
699
700 /*
701  *      Send a SYN-ACK after having received a SYN.
702  *      This still operates on a request_sock only, not on a big
703  *      socket.
704  */
705 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
706                                 struct dst_entry *dst)
707 {
708         const struct inet_request_sock *ireq = inet_rsk(req);
709         int err = -1;
710         struct sk_buff * skb;
711
712         /* First, grab a route. */
713         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
714                 return -1;
715
716         skb = tcp_make_synack(sk, dst, req);
717
718         if (skb) {
719                 struct tcphdr *th = tcp_hdr(skb);
720
721                 th->check = tcp_v4_check(skb->len,
722                                          ireq->loc_addr,
723                                          ireq->rmt_addr,
724                                          csum_partial((char *)th, skb->len,
725                                                       skb->csum));
726
727                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
728                                             ireq->rmt_addr,
729                                             ireq->opt);
730                 err = net_xmit_eval(err);
731         }
732
733         dst_release(dst);
734         return err;
735 }
736
737 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
738 {
739         return __tcp_v4_send_synack(sk, req, NULL);
740 }
741
742 /*
743  *      IPv4 request_sock destructor.
744  */
745 static void tcp_v4_reqsk_destructor(struct request_sock *req)
746 {
747         kfree(inet_rsk(req)->opt);
748 }
749
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a notice about a possible SYN flood on the destination port of @skb.
 * The message is printed only when more than 60 seconds of jiffies have
 * passed since the previously recorded warning time.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
763
764 /*
765  * Save and compile IPv4 options into the request_sock if needed.
766  */
767 static struct ip_options *tcp_v4_save_options(struct sock *sk,
768                                               struct sk_buff *skb)
769 {
770         struct ip_options *opt = &(IPCB(skb)->opt);
771         struct ip_options *dopt = NULL;
772
773         if (opt && opt->optlen) {
774                 int opt_size = optlength(opt);
775                 dopt = kmalloc(opt_size, GFP_ATOMIC);
776                 if (dopt) {
777                         if (ip_options_echo(dopt, skb)) {
778                                 kfree(dopt);
779                                 dopt = NULL;
780                         }
781                 }
782         }
783         return dopt;
784 }
785
786 #ifdef CONFIG_TCP_MD5SIG
787 /*
788  * RFC2385 MD5 checksumming requires a mapping of
789  * IP address->MD5 Key.
790  * We need to maintain these in the sk structure.
791  */
792
793 /* Find the Key structure for an address.  */
794 static struct tcp_md5sig_key *
795                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
796 {
797         struct tcp_sock *tp = tcp_sk(sk);
798         int i;
799
800         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
801                 return NULL;
802         for (i = 0; i < tp->md5sig_info->entries4; i++) {
803                 if (tp->md5sig_info->keys4[i].addr == addr)
804                         return &tp->md5sig_info->keys4[i].base;
805         }
806         return NULL;
807 }
808
809 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
810                                          struct sock *addr_sk)
811 {
812         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
813 }
814
815 EXPORT_SYMBOL(tcp_v4_md5_lookup);
816
817 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
818                                                       struct request_sock *req)
819 {
820         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
821 }
822
823 /* This can be called on a newly created socket, from other files */
824 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
825                       u8 *newkey, u8 newkeylen)
826 {
827         /* Add Key to the list */
828         struct tcp_md5sig_key *key;
829         struct tcp_sock *tp = tcp_sk(sk);
830         struct tcp4_md5sig_key *keys;
831
832         key = tcp_v4_md5_do_lookup(sk, addr);
833         if (key) {
834                 /* Pre-existing entry - just update that one. */
835                 kfree(key->key);
836                 key->key = newkey;
837                 key->keylen = newkeylen;
838         } else {
839                 struct tcp_md5sig_info *md5sig;
840
841                 if (!tp->md5sig_info) {
842                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
843                                                   GFP_ATOMIC);
844                         if (!tp->md5sig_info) {
845                                 kfree(newkey);
846                                 return -ENOMEM;
847                         }
848                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
849                 }
850                 if (tcp_alloc_md5sig_pool() == NULL) {
851                         kfree(newkey);
852                         return -ENOMEM;
853                 }
854                 md5sig = tp->md5sig_info;
855
856                 if (md5sig->alloced4 == md5sig->entries4) {
857                         keys = kmalloc((sizeof(*keys) *
858                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
859                         if (!keys) {
860                                 kfree(newkey);
861                                 tcp_free_md5sig_pool();
862                                 return -ENOMEM;
863                         }
864
865                         if (md5sig->entries4)
866                                 memcpy(keys, md5sig->keys4,
867                                        sizeof(*keys) * md5sig->entries4);
868
869                         /* Free old key list, and reference new one */
870                         kfree(md5sig->keys4);
871                         md5sig->keys4 = keys;
872                         md5sig->alloced4++;
873                 }
874                 md5sig->entries4++;
875                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
876                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
877                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
878         }
879         return 0;
880 }
881
882 EXPORT_SYMBOL(tcp_v4_md5_do_add);
883
884 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
885                                u8 *newkey, u8 newkeylen)
886 {
887         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
888                                  newkey, newkeylen);
889 }
890
891 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
892 {
893         struct tcp_sock *tp = tcp_sk(sk);
894         int i;
895
896         for (i = 0; i < tp->md5sig_info->entries4; i++) {
897                 if (tp->md5sig_info->keys4[i].addr == addr) {
898                         /* Free the key */
899                         kfree(tp->md5sig_info->keys4[i].base.key);
900                         tp->md5sig_info->entries4--;
901
902                         if (tp->md5sig_info->entries4 == 0) {
903                                 kfree(tp->md5sig_info->keys4);
904                                 tp->md5sig_info->keys4 = NULL;
905                                 tp->md5sig_info->alloced4 = 0;
906                         } else if (tp->md5sig_info->entries4 != i) {
907                                 /* Need to do some manipulation */
908                                 memmove(&tp->md5sig_info->keys4[i],
909                                         &tp->md5sig_info->keys4[i+1],
910                                         (tp->md5sig_info->entries4 - i) *
911                                          sizeof(struct tcp4_md5sig_key));
912                         }
913                         tcp_free_md5sig_pool();
914                         return 0;
915                 }
916         }
917         return -ENOENT;
918 }
919
920 EXPORT_SYMBOL(tcp_v4_md5_do_del);
921
922 static void tcp_v4_clear_md5_list(struct sock *sk)
923 {
924         struct tcp_sock *tp = tcp_sk(sk);
925
926         /* Free each key, then the set of key keys,
927          * the crypto element, and then decrement our
928          * hold on the last resort crypto.
929          */
930         if (tp->md5sig_info->entries4) {
931                 int i;
932                 for (i = 0; i < tp->md5sig_info->entries4; i++)
933                         kfree(tp->md5sig_info->keys4[i].base.key);
934                 tp->md5sig_info->entries4 = 0;
935                 tcp_free_md5sig_pool();
936         }
937         if (tp->md5sig_info->keys4) {
938                 kfree(tp->md5sig_info->keys4);
939                 tp->md5sig_info->keys4 = NULL;
940                 tp->md5sig_info->alloced4  = 0;
941         }
942 }
943
944 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
945                                  int optlen)
946 {
947         struct tcp_md5sig cmd;
948         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
949         u8 *newkey;
950
951         if (optlen < sizeof(cmd))
952                 return -EINVAL;
953
954         if (copy_from_user(&cmd, optval, sizeof(cmd)))
955                 return -EFAULT;
956
957         if (sin->sin_family != AF_INET)
958                 return -EINVAL;
959
960         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
961                 if (!tcp_sk(sk)->md5sig_info)
962                         return -ENOENT;
963                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
964         }
965
966         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
967                 return -EINVAL;
968
969         if (!tcp_sk(sk)->md5sig_info) {
970                 struct tcp_sock *tp = tcp_sk(sk);
971                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
972
973                 if (!p)
974                         return -EINVAL;
975
976                 tp->md5sig_info = p;
977                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
978         }
979
980         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
981         if (!newkey)
982                 return -ENOMEM;
983         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
984                                  newkey, cmd.tcpm_keylen);
985 }
986
987 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
988                                         __be32 daddr, __be32 saddr, int nbytes)
989 {
990         struct tcp4_pseudohdr *bp;
991         struct scatterlist sg;
992
993         bp = &hp->md5_blk.ip4;
994
995         /*
996          * 1. the TCP pseudo-header (in the order: source IP address,
997          * destination IP address, zero-padded protocol number, and
998          * segment length)
999          */
1000         bp->saddr = saddr;
1001         bp->daddr = daddr;
1002         bp->pad = 0;
1003         bp->protocol = IPPROTO_TCP;
1004         bp->len = cpu_to_be16(nbytes);
1005
1006         sg_init_one(&sg, bp, sizeof(*bp));
1007         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008 }
1009
1010 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1012 {
1013         struct tcp_md5sig_pool *hp;
1014         struct hash_desc *desc;
1015
1016         hp = tcp_get_md5sig_pool();
1017         if (!hp)
1018                 goto clear_hash_noput;
1019         desc = &hp->md5_desc;
1020
1021         if (crypto_hash_init(desc))
1022                 goto clear_hash;
1023         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024                 goto clear_hash;
1025         if (tcp_md5_hash_header(hp, th))
1026                 goto clear_hash;
1027         if (tcp_md5_hash_key(hp, key))
1028                 goto clear_hash;
1029         if (crypto_hash_final(desc, md5_hash))
1030                 goto clear_hash;
1031
1032         tcp_put_md5sig_pool();
1033         return 0;
1034
1035 clear_hash:
1036         tcp_put_md5sig_pool();
1037 clear_hash_noput:
1038         memset(md5_hash, 0, 16);
1039         return 1;
1040 }
1041
1042 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043                         struct sock *sk, struct request_sock *req,
1044                         struct sk_buff *skb)
1045 {
1046         struct tcp_md5sig_pool *hp;
1047         struct hash_desc *desc;
1048         struct tcphdr *th = tcp_hdr(skb);
1049         __be32 saddr, daddr;
1050
1051         if (sk) {
1052                 saddr = inet_sk(sk)->saddr;
1053                 daddr = inet_sk(sk)->daddr;
1054         } else if (req) {
1055                 saddr = inet_rsk(req)->loc_addr;
1056                 daddr = inet_rsk(req)->rmt_addr;
1057         } else {
1058                 const struct iphdr *iph = ip_hdr(skb);
1059                 saddr = iph->saddr;
1060                 daddr = iph->daddr;
1061         }
1062
1063         hp = tcp_get_md5sig_pool();
1064         if (!hp)
1065                 goto clear_hash_noput;
1066         desc = &hp->md5_desc;
1067
1068         if (crypto_hash_init(desc))
1069                 goto clear_hash;
1070
1071         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072                 goto clear_hash;
1073         if (tcp_md5_hash_header(hp, th))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_key(hp, key))
1078                 goto clear_hash;
1079         if (crypto_hash_final(desc, md5_hash))
1080                 goto clear_hash;
1081
1082         tcp_put_md5sig_pool();
1083         return 0;
1084
1085 clear_hash:
1086         tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088         memset(md5_hash, 0, 16);
1089         return 1;
1090 }
1091
1092 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093
1094 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095 {
1096         /*
1097          * This gets called for each TCP segment that arrives
1098          * so we want to be efficient.
1099          * We have 3 drop cases:
1100          * o No MD5 hash and one expected.
1101          * o MD5 hash and we're not expecting one.
1102          * o MD5 hash and its wrong.
1103          */
1104         __u8 *hash_location = NULL;
1105         struct tcp_md5sig_key *hash_expected;
1106         const struct iphdr *iph = ip_hdr(skb);
1107         struct tcphdr *th = tcp_hdr(skb);
1108         int genhash;
1109         unsigned char newhash[16];
1110
1111         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112         hash_location = tcp_parse_md5sig_option(th);
1113
1114         /* We've parsed the options - do we have a hash? */
1115         if (!hash_expected && !hash_location)
1116                 return 0;
1117
1118         if (hash_expected && !hash_location) {
1119                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1120                 return 1;
1121         }
1122
1123         if (!hash_expected && hash_location) {
1124                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1125                 return 1;
1126         }
1127
1128         /* Okay, so this is hash_expected and hash_location -
1129          * so we need to calculate the checksum.
1130          */
1131         genhash = tcp_v4_md5_hash_skb(newhash,
1132                                       hash_expected,
1133                                       NULL, NULL, skb);
1134
1135         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1136                 if (net_ratelimit()) {
1137                         printk(KERN_INFO "MD5 Hash failed for "
1138                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1139                                NIPQUAD(iph->saddr), ntohs(th->source),
1140                                NIPQUAD(iph->daddr), ntohs(th->dest),
1141                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1142                 }
1143                 return 1;
1144         }
1145         return 0;
1146 }
1147
1148 #endif
1149
1150 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1151         .family         =       PF_INET,
1152         .obj_size       =       sizeof(struct tcp_request_sock),
1153         .rtx_syn_ack    =       tcp_v4_send_synack,
1154         .send_ack       =       tcp_v4_reqsk_send_ack,
1155         .destructor     =       tcp_v4_reqsk_destructor,
1156         .send_reset     =       tcp_v4_send_reset,
1157 };
1158
#ifdef CONFIG_TCP_MD5SIG
/* IPv4-specific request-sock hooks; only the MD5 key lookup is set. */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .md5_lookup = tcp_v4_reqsk_md5_lookup,
};
#endif
1164
1165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1166         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1167         .twsk_unique    = tcp_twsk_unique,
1168         .twsk_destructor= tcp_twsk_destructor,
1169 };
1170
/*
 * Handle a connection request arriving in @skb for listening socket @sk.
 *
 * Allocates a request_sock, parses the TCP options, records the local and
 * remote addresses and the saved IP options, selects the initial sequence
 * number (syncookie, the value passed in TCP_SKB_CB(skb)->when, or
 * tcp_v4_init_sequence()), sends the SYN-ACK and, unless a syncookie was
 * used, adds the request to the listener's request queue.
 *
 * Every path returns 0, including the ones that drop the request.
 */
1171 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1172 {
1173         struct inet_request_sock *ireq;
1174         struct tcp_options_received tmp_opt;
1175         struct request_sock *req;
1176         __be32 saddr = ip_hdr(skb)->saddr;
1177         __be32 daddr = ip_hdr(skb)->daddr;
1178         __u32 isn = TCP_SKB_CB(skb)->when;
1179         struct dst_entry *dst = NULL;
             /* Without CONFIG_SYN_COOKIES, want_cookie is a constant 0 macro
              * so that the cookie branches below compile to nothing.
              */
1180 #ifdef CONFIG_SYN_COOKIES
1181         int want_cookie = 0;
1182 #else
1183 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1184 #endif
1185
1186         /* Never answer to SYNs sent to broadcast or multicast */
1187         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1188                 goto drop;
1189
1190         /* TW buckets are converted to open requests without
1191          * limitations, they conserve resources and peer is
1192          * evidently real one.
1193          */
1194         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1195 #ifdef CONFIG_SYN_COOKIES
1196                 if (sysctl_tcp_syncookies) {
1197                         want_cookie = 1;
1198                 } else
1199 #endif
1200                 goto drop;
1201         }
1202
1203         /* Accept backlog is full. If we have already queued enough
1204          * of warm entries in syn queue, drop request. It is better than
1205          * clogging syn queue with openreqs with exponentially increasing
1206          * timeout.
1207          */
1208         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1209                 goto drop;
1210
1211         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1212         if (!req)
1213                 goto drop;
1214
1215 #ifdef CONFIG_TCP_MD5SIG
1216         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1217 #endif
1218
1219         tcp_clear_options(&tmp_opt);
1220         tmp_opt.mss_clamp = 536;
1221         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1222
1223         tcp_parse_options(skb, &tmp_opt, 0);
1224
             /* In syncookie mode the parsed options are discarded unless the
              * peer sent a timestamp.
              */
1225         if (want_cookie && !tmp_opt.saw_tstamp)
1226                 tcp_clear_options(&tmp_opt);
1227
1228         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1229                 /* Some OSes (unknown ones, but I see them on web server, which
1230                  * contains information interesting only for windows'
1231                  * users) do not send their stamp in SYN. It is easy case.
1232                  * We simply do not advertise TS support.
1233                  */
1234                 tmp_opt.saw_tstamp = 0;
1235                 tmp_opt.tstamp_ok  = 0;
1236         }
1237         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1238
1239         tcp_openreq_init(req, &tmp_opt, skb);
1240
1241         if (security_inet_conn_request(sk, skb, req))
1242                 goto drop_and_free;
1243
1244         ireq = inet_rsk(req);
1245         ireq->loc_addr = daddr;
1246         ireq->rmt_addr = saddr;
1247         ireq->opt = tcp_v4_save_options(sk, skb);
1248         if (!want_cookie)
1249                 TCP_ECN_create_request(req, tcp_hdr(skb));
1250
1251         if (want_cookie) {
1252 #ifdef CONFIG_SYN_COOKIES
1253                 syn_flood_warning(skb);
1254                 req->cookie_ts = tmp_opt.tstamp_ok;
1255 #endif
1256                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1257         } else if (!isn) {
1258                 struct inet_peer *peer = NULL;
1259
1260                 /* VJ's idea. We save last timestamp seen
1261                  * from the destination in peer table, when entering
1262                  * state TIME-WAIT, and check against it before
1263                  * accepting new connection request.
1264                  *
1265                  * If "isn" is not zero, this request hit alive
1266                  * timewait bucket, so that all the necessary checks
1267                  * are made in the function processing timewait state.
1268                  */
                     /* Note: the condition below assigns dst and peer as a
                      * side effect; the else-if branch further down reads
                      * whatever was assigned before the chain stopped.
                      */
1269                 if (tmp_opt.saw_tstamp &&
1270                     tcp_death_row.sysctl_tw_recycle &&
1271                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1272                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1273                     peer->v4daddr == saddr) {
1274                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1275                             (s32)(peer->tcp_ts - req->ts_recent) >
1276                                                         TCP_PAWS_WINDOW) {
1277                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1278                                 goto drop_and_release;
1279                         }
1280                 }
1281                 /* Kill the following clause, if you dislike this way. */
1282                 else if (!sysctl_tcp_syncookies &&
1283                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1284                           (sysctl_max_syn_backlog >> 2)) &&
1285                          (!peer || !peer->tcp_ts_stamp) &&
1286                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1287                         /* Without syncookies last quarter of
1288                          * backlog is filled with destinations,
1289                          * proven to be alive.
1290                          * It means that we continue to communicate
1291                          * to destinations, already remembered
1292                          * to the moment of synflood.
1293                          */
1294                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1295                                        "request from " NIPQUAD_FMT "/%u\n",
1296                                        NIPQUAD(saddr),
1297                                        ntohs(tcp_hdr(skb)->source));
1298                         goto drop_and_release;
1299                 }
1300
1301                 isn = tcp_v4_init_sequence(skb);
1302         }
1303         tcp_rsk(req)->snt_isn = isn;
1304
             /* A syncookie request is never queued: it is freed right after
              * the SYN-ACK has been sent, as is a request whose SYN-ACK
              * could not be sent.
              */
1305         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1306                 goto drop_and_free;
1307
1308         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1309         return 0;
1310
             /* Cleanup ladder: each label falls through to the ones below. */
1311 drop_and_release:
1312         dst_release(dst);
1313 drop_and_free:
1314         reqsk_free(req);
1315 drop:
1316         return 0;
1317 }
1318
1319
1320 /*
1321  * The three way handshake has completed - we got a valid synack -
1322  * now create the new socket.
1323  */
1324 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1325                                   struct request_sock *req,
1326                                   struct dst_entry *dst)
1327 {
1328         struct inet_request_sock *ireq;
1329         struct inet_sock *newinet;
1330         struct tcp_sock *newtp;
1331         struct sock *newsk;
1332 #ifdef CONFIG_TCP_MD5SIG
1333         struct tcp_md5sig_key *key;
1334 #endif
1335
1336         if (sk_acceptq_is_full(sk))
1337                 goto exit_overflow;
1338
1339         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1340                 goto exit;
1341
1342         newsk = tcp_create_openreq_child(sk, req, skb);
1343         if (!newsk)
1344                 goto exit;
1345
1346         newsk->sk_gso_type = SKB_GSO_TCPV4;
1347         sk_setup_caps(newsk, dst);
1348
1349         newtp                 = tcp_sk(newsk);
1350         newinet               = inet_sk(newsk);
1351         ireq                  = inet_rsk(req);
1352         newinet->daddr        = ireq->rmt_addr;
1353         newinet->rcv_saddr    = ireq->loc_addr;
1354         newinet->saddr        = ireq->loc_addr;
1355         newinet->opt          = ireq->opt;
1356         ireq->opt             = NULL;
1357         newinet->mc_index     = inet_iif(skb);
1358         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1359         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1360         if (newinet->opt)
1361                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1362         newinet->id = newtp->write_seq ^ jiffies;
1363
1364         tcp_mtup_init(newsk);
1365         tcp_sync_mss(newsk, dst_mtu(dst));
1366         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1367         tcp_initialize_rcv_mss(newsk);
1368
1369 #ifdef CONFIG_TCP_MD5SIG
1370         /* Copy over the MD5 key from the original socket */
1371         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1372                 /*
1373                  * We're using one, so create a matching key
1374                  * on the newsk structure. If we fail to get
1375                  * memory, then we end up not copying the key
1376                  * across. Shucks.
1377                  */
1378                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1379                 if (newkey != NULL)
1380                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1381                                           newkey, key->keylen);
1382                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1383         }
1384 #endif
1385
1386         __inet_hash_nolisten(newsk);
1387         __inet_inherit_port(sk, newsk);
1388
1389         return newsk;
1390
1391 exit_overflow:
1392         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1393 exit:
1394         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1395         dst_release(dst);
1396         return NULL;
1397 }
1398
1399 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1400 {
1401         struct tcphdr *th = tcp_hdr(skb);
1402         const struct iphdr *iph = ip_hdr(skb);
1403         struct sock *nsk;
1404         struct request_sock **prev;
1405         /* Find possible connection requests. */
1406         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1407                                                        iph->saddr, iph->daddr);
1408         if (req)
1409                 return tcp_check_req(sk, skb, req, prev);
1410
1411         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1412                         th->source, iph->daddr, th->dest, inet_iif(skb));
1413
1414         if (nsk) {
1415                 if (nsk->sk_state != TCP_TIME_WAIT) {
1416                         bh_lock_sock(nsk);
1417                         return nsk;
1418                 }
1419                 inet_twsk_put(inet_twsk(nsk));
1420                 return NULL;
1421         }
1422
1423 #ifdef CONFIG_SYN_COOKIES
1424         if (!th->rst && !th->syn && th->ack)
1425                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1426 #endif
1427         return sk;
1428 }
1429
1430 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1431 {
1432         const struct iphdr *iph = ip_hdr(skb);
1433
1434         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1435                 if (!tcp_v4_check(skb->len, iph->saddr,
1436                                   iph->daddr, skb->csum)) {
1437                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1438                         return 0;
1439                 }
1440         }
1441
1442         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1443                                        skb->len, IPPROTO_TCP, 0);
1444
1445         if (skb->len <= 76) {
1446                 return __skb_checksum_complete(skb);
1447         }
1448         return 0;
1449 }
1450
1451
1452 /* The socket must have it's spinlock held when we get
1453  * here.
1454  *
1455  * We have a potential double-lock case here, so even when
1456  * doing backlog processing we use the BH locking scheme.
1457  * This is because we cannot sleep with the original spinlock
1458  * held.
1459  */
1460 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1461 {
1462         struct sock *rsk;
1463 #ifdef CONFIG_TCP_MD5SIG
1464         /*
1465          * We really want to reject the packet as early as possible
1466          * if:
1467          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1468          *  o There is an MD5 option and we're not expecting one
1469          */
1470         if (tcp_v4_inbound_md5_hash(sk, skb))
1471                 goto discard;
1472 #endif
1473
1474         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1475                 TCP_CHECK_TIMER(sk);
1476                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1477                         rsk = sk;
1478                         goto reset;
1479                 }
1480                 TCP_CHECK_TIMER(sk);
1481                 return 0;
1482         }
1483
1484         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1485                 goto csum_err;
1486
1487         if (sk->sk_state == TCP_LISTEN) {
1488                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1489                 if (!nsk)
1490                         goto discard;
1491
1492                 if (nsk != sk) {
1493                         if (tcp_child_process(sk, nsk, skb)) {
1494                                 rsk = nsk;
1495                                 goto reset;
1496                         }
1497                         return 0;
1498                 }
1499         }
1500
1501         TCP_CHECK_TIMER(sk);
1502         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1503                 rsk = sk;
1504                 goto reset;
1505         }
1506         TCP_CHECK_TIMER(sk);
1507         return 0;
1508
1509 reset:
1510         tcp_v4_send_reset(rsk, skb);
1511 discard:
1512         kfree_skb(skb);
1513         /* Be careful here. If this function gets more complicated and
1514          * gcc suffers from register pressure on the x86, sk (in %ebx)
1515          * might be destroyed here. This current version compiles correctly,
1516          * but you have been warned.
1517          */
1518         return 0;
1519
1520 csum_err:
1521         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1522         goto discard;
1523 }
1524
1525 /*
1526  *      From tcp_input.c
1527  */
1528
/*
 * Receive one IPv4 TCP segment.
 *
 * Validates the TCP header length and (where needed) sets up checksum
 * state, fills in the TCP control block of @skb, and looks up the owning
 * socket.  A segment for a regular socket is processed directly, queued
 * through tcp_prequeue(), or put on the socket backlog when the socket is
 * owned by the user.  Segments for TIME_WAIT sockets and segments without
 * a socket are handled at the labels further down.
 *
 * Returns the result of tcp_v4_do_rcv() when it was called directly,
 * otherwise 0.
 */
1529 int tcp_v4_rcv(struct sk_buff *skb)
1530 {
1531         const struct iphdr *iph;
1532         struct tcphdr *th;
1533         struct sock *sk;
1534         int ret;
1535         struct net *net = dev_net(skb->dev);
1536
1537         if (skb->pkt_type != PACKET_HOST)
1538                 goto discard_it;
1539
1540         /* Count it even if it's bad */
1541         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1542
1543         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1544                 goto discard_it;
1545
1546         th = tcp_hdr(skb);
1547
             /* doff counts 32-bit words; reject headers shorter than the
              * fixed TCP header.
              */
1548         if (th->doff < sizeof(struct tcphdr) / 4)
1549                 goto bad_packet;
1550         if (!pskb_may_pull(skb, th->doff * 4))
1551                 goto discard_it;
1552
1553         /* An explanation is required here, I think.
1554          * Packet length and doff are validated by header prediction,
1555          * provided case of th->doff==0 is eliminated.
1556          * So, we defer the checks. */
1557         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1558                 goto bad_packet;
1559
             /* Header pointers are fetched again after the pulls above. */
1560         th = tcp_hdr(skb);
1561         iph = ip_hdr(skb);
1562         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1563         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1564                                     skb->len - th->doff * 4);
1565         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1566         TCP_SKB_CB(skb)->when    = 0;
1567         TCP_SKB_CB(skb)->flags   = iph->tos;
1568         TCP_SKB_CB(skb)->sacked  = 0;
1569
1570         sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1571                         th->source, iph->daddr, th->dest, inet_iif(skb));
1572         if (!sk)
1573                 goto no_tcp_socket;
1574
     /* Also entered from the TIME_WAIT path below with sk replaced by a
      * listener.
      */
1575 process:
1576         if (sk->sk_state == TCP_TIME_WAIT)
1577                 goto do_time_wait;
1578
1579         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1580                 goto discard_and_relse;
1581         nf_reset(skb);
1582
1583         if (sk_filter(sk, skb))
1584                 goto discard_and_relse;
1585
1586         skb->dev = NULL;
1587
1588         bh_lock_sock_nested(sk);
1589         ret = 0;
1590         if (!sock_owned_by_user(sk)) {
1591 #ifdef CONFIG_NET_DMA
1592                 struct tcp_sock *tp = tcp_sk(sk);
1593                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1594                         tp->ucopy.dma_chan = get_softnet_dma();
1595                 if (tp->ucopy.dma_chan)
1596                         ret = tcp_v4_do_rcv(sk, skb);
1597                 else
1598 #endif
1599                 {
                             /* Process now only if tcp_prequeue() returned 0. */
1600                         if (!tcp_prequeue(sk, skb))
1601                         ret = tcp_v4_do_rcv(sk, skb);
1602                 }
1603         } else
1604                 sk_add_backlog(sk, skb);
1605         bh_unlock_sock(sk);
1606
1607         sock_put(sk);
1608
1609         return ret;
1610
1611 no_tcp_socket:
1612         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1613                 goto discard_it;
1614
             /* The bad_packet label sits inside the if-body so that the
              * early header checks above share the error counter with the
              * length/checksum failure detected here; a well-formed segment
              * without a socket is answered with a reset instead.
              */
1615         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1616 bad_packet:
1617                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1618         } else {
1619                 tcp_v4_send_reset(NULL, skb);
1620         }
1621
1622 discard_it:
1623         /* Discard frame. */
1624         kfree_skb(skb);
1625         return 0;
1626
1627 discard_and_relse:
1628         sock_put(sk);
1629         goto discard_it;
1630
     /* sk is a timewait socket here; every exit from this section drops
      * the reference with inet_twsk_put() except the paths that go on to
      * tcp_v4_timewait_ack(), no_tcp_socket or plain discard, which do not
      * call it in this function.
      */
1631 do_time_wait:
1632         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1633                 inet_twsk_put(inet_twsk(sk));
1634                 goto discard_it;
1635         }
1636
1637         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1638                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1639                 inet_twsk_put(inet_twsk(sk));
1640                 goto discard_it;
1641         }
1642         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1643         case TCP_TW_SYN: {
                     /* A listener for this address/port takes over the
                      * segment; the timewait socket is descheduled and
                      * released first.
                      */
1644                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1645                                                         &tcp_hashinfo,
1646                                                         iph->daddr, th->dest,
1647                                                         inet_iif(skb));
1648                 if (sk2) {
1649                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1650                         inet_twsk_put(inet_twsk(sk));
1651                         sk = sk2;
1652                         goto process;
1653                 }
1654                 /* Fall through to ACK */
1655         }
1656         case TCP_TW_ACK:
1657                 tcp_v4_timewait_ack(sk, skb);
1658                 break;
1659         case TCP_TW_RST:
1660                 goto no_tcp_socket;
1661         case TCP_TW_SUCCESS:;
1662         }
1663         goto discard_it;
1664 }
1665
1666 /* VJ's idea. Save last timestamp seen from this destination
1667  * and hold it at least for normal timewait interval to use for duplicate
1668  * segment detection in subsequent connections, before they enter synchronized
1669  * state.
1670  */
1671
1672 int tcp_v4_remember_stamp(struct sock *sk)
1673 {
1674         struct inet_sock *inet = inet_sk(sk);
1675         struct tcp_sock *tp = tcp_sk(sk);
1676         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1677         struct inet_peer *peer = NULL;
1678         int release_it = 0;
1679
1680         if (!rt || rt->rt_dst != inet->daddr) {
1681                 peer = inet_getpeer(inet->daddr, 1);
1682                 release_it = 1;
1683         } else {
1684                 if (!rt->peer)
1685                         rt_bind_peer(rt, 1);
1686                 peer = rt->peer;
1687         }
1688
1689         if (peer) {
1690                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1691                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1692                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1693                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1694                         peer->tcp_ts = tp->rx_opt.ts_recent;
1695                 }
1696                 if (release_it)
1697                         inet_putpeer(peer);
1698                 return 1;
1699         }
1700
1701         return 0;
1702 }
1703
1704 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1705 {
1706         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1707
1708         if (peer) {
1709                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1710
1711                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1712                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1713                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1714                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1715                         peer->tcp_ts       = tcptw->tw_ts_recent;
1716                 }
1717                 inet_putpeer(peer);
1718                 return 1;
1719         }
1720
1721         return 0;
1722 }
1723
1724 struct inet_connection_sock_af_ops ipv4_specific = {
1725         .queue_xmit        = ip_queue_xmit,
1726         .send_check        = tcp_v4_send_check,
1727         .rebuild_header    = inet_sk_rebuild_header,
1728         .conn_request      = tcp_v4_conn_request,
1729         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1730         .remember_stamp    = tcp_v4_remember_stamp,
1731         .net_header_len    = sizeof(struct iphdr),
1732         .setsockopt        = ip_setsockopt,
1733         .getsockopt        = ip_getsockopt,
1734         .addr2sockaddr     = inet_csk_addr2sockaddr,
1735         .sockaddr_len      = sizeof(struct sockaddr_in),
1736         .bind_conflict     = inet_csk_bind_conflict,
1737 #ifdef CONFIG_COMPAT
1738         .compat_setsockopt = compat_ip_setsockopt,
1739         .compat_getsockopt = compat_ip_getsockopt,
1740 #endif
1741 };
1742
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature operations. */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
1751
1752 /* NOTE: A lot of things set to zero explicitly by call to
1753  *       sk_alloc() so need not be done here.
1754  */
1755 static int tcp_v4_init_sock(struct sock *sk)
1756 {
1757         struct inet_connection_sock *icsk = inet_csk(sk);
1758         struct tcp_sock *tp = tcp_sk(sk);
1759
1760         skb_queue_head_init(&tp->out_of_order_queue);
1761         tcp_init_xmit_timers(sk);
1762         tcp_prequeue_init(tp);
1763
1764         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1765         tp->mdev = TCP_TIMEOUT_INIT;
1766
1767         /* So many TCP implementations out there (incorrectly) count the
1768          * initial SYN frame in their delayed-ACK and congestion control
1769          * algorithms that we must have the following bandaid to talk
1770          * efficiently to them.  -DaveM
1771          */
1772         tp->snd_cwnd = 2;
1773
1774         /* See draft-stevens-tcpca-spec-01 for discussion of the
1775          * initialization of these values.
1776          */
1777         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1778         tp->snd_cwnd_clamp = ~0;
1779         tp->mss_cache = 536;
1780
1781         tp->reordering = sysctl_tcp_reordering;
1782         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1783
1784         sk->sk_state = TCP_CLOSE;
1785
1786         sk->sk_write_space = sk_stream_write_space;
1787         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1788
1789         icsk->icsk_af_ops = &ipv4_specific;
1790         icsk->icsk_sync_mss = tcp_sync_mss;
1791 #ifdef CONFIG_TCP_MD5SIG
1792         tp->af_specific = &tcp_sock_ipv4_specific;
1793 #endif
1794
1795         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1796         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1797
1798         atomic_inc(&tcp_sockets_allocated);
1799
1800         return 0;
1801 }
1802
1803 void tcp_v4_destroy_sock(struct sock *sk)
1804 {
1805         struct tcp_sock *tp = tcp_sk(sk);
1806
1807         tcp_clear_xmit_timers(sk);
1808
1809         tcp_cleanup_congestion_control(sk);
1810
1811         /* Cleanup up the write buffer. */
1812         tcp_write_queue_purge(sk);
1813
1814         /* Cleans up our, hopefully empty, out_of_order_queue. */
1815         __skb_queue_purge(&tp->out_of_order_queue);
1816
1817 #ifdef CONFIG_TCP_MD5SIG
1818         /* Clean up the MD5 key list, if any */
1819         if (tp->md5sig_info) {
1820                 tcp_v4_clear_md5_list(sk);
1821                 kfree(tp->md5sig_info);
1822                 tp->md5sig_info = NULL;
1823         }
1824 #endif
1825
1826 #ifdef CONFIG_NET_DMA
1827         /* Cleans up our sk_async_wait_queue */
1828         __skb_queue_purge(&sk->sk_async_wait_queue);
1829 #endif
1830
1831         /* Clean prequeue, it must be empty really */
1832         __skb_queue_purge(&tp->ucopy.prequeue);
1833
1834         /* Clean up a referenced TCP bind bucket. */
1835         if (inet_csk(sk)->icsk_bind_hash)
1836                 inet_put_port(sk);
1837
1838         /*
1839          * If sendmsg cached page exists, toss it.
1840          */
1841         if (sk->sk_sndmsg_page) {
1842                 __free_page(sk->sk_sndmsg_page);
1843                 sk->sk_sndmsg_page = NULL;
1844         }
1845
1846         atomic_dec(&tcp_sockets_allocated);
1847 }
1848
1849 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1850
1851 #ifdef CONFIG_PROC_FS
1852 /* Proc filesystem TCP sock list dumping. */
1853
1854 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1855 {
1856         return hlist_empty(head) ? NULL :
1857                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1858 }
1859
1860 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1861 {
1862         return tw->tw_node.next ?
1863                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1864 }
1865
1866 static void *listening_get_next(struct seq_file *seq, void *cur)
1867 {
1868         struct inet_connection_sock *icsk;
1869         struct hlist_node *node;
1870         struct sock *sk = cur;
1871         struct tcp_iter_state* st = seq->private;
1872         struct net *net = seq_file_net(seq);
1873
1874         if (!sk) {
1875                 st->bucket = 0;
1876                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1877                 goto get_sk;
1878         }
1879
1880         ++st->num;
1881
1882         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1883                 struct request_sock *req = cur;
1884
1885                 icsk = inet_csk(st->syn_wait_sk);
1886                 req = req->dl_next;
1887                 while (1) {
1888                         while (req) {
1889                                 if (req->rsk_ops->family == st->family) {
1890                                         cur = req;
1891                                         goto out;
1892                                 }
1893                                 req = req->dl_next;
1894                         }
1895                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1896                                 break;
1897 get_req:
1898                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1899                 }
1900                 sk        = sk_next(st->syn_wait_sk);
1901                 st->state = TCP_SEQ_STATE_LISTENING;
1902                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1903         } else {
1904                 icsk = inet_csk(sk);
1905                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1906                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1907                         goto start_req;
1908                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909                 sk = sk_next(sk);
1910         }
1911 get_sk:
1912         sk_for_each_from(sk, node) {
1913                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1914                         cur = sk;
1915                         goto out;
1916                 }
1917                 icsk = inet_csk(sk);
1918                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1919                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1920 start_req:
1921                         st->uid         = sock_i_uid(sk);
1922                         st->syn_wait_sk = sk;
1923                         st->state       = TCP_SEQ_STATE_OPENREQ;
1924                         st->sbucket     = 0;
1925                         goto get_req;
1926                 }
1927                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1928         }
1929         if (++st->bucket < INET_LHTABLE_SIZE) {
1930                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1931                 goto get_sk;
1932         }
1933         cur = NULL;
1934 out:
1935         return cur;
1936 }
1937
1938 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1939 {
1940         void *rc = listening_get_next(seq, NULL);
1941
1942         while (rc && *pos) {
1943                 rc = listening_get_next(seq, rc);
1944                 --*pos;
1945         }
1946         return rc;
1947 }
1948
1949 static void *established_get_first(struct seq_file *seq)
1950 {
1951         struct tcp_iter_state* st = seq->private;
1952         struct net *net = seq_file_net(seq);
1953         void *rc = NULL;
1954
1955         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1956                 struct sock *sk;
1957                 struct hlist_node *node;
1958                 struct inet_timewait_sock *tw;
1959                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960
1961                 read_lock_bh(lock);
1962                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1963                         if (sk->sk_family != st->family ||
1964                             !net_eq(sock_net(sk), net)) {
1965                                 continue;
1966                         }
1967                         rc = sk;
1968                         goto out;
1969                 }
1970                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1971                 inet_twsk_for_each(tw, node,
1972                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1973                         if (tw->tw_family != st->family ||
1974                             !net_eq(twsk_net(tw), net)) {
1975                                 continue;
1976                         }
1977                         rc = tw;
1978                         goto out;
1979                 }
1980                 read_unlock_bh(lock);
1981                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1982         }
1983 out:
1984         return rc;
1985 }
1986
1987 static void *established_get_next(struct seq_file *seq, void *cur)
1988 {
1989         struct sock *sk = cur;
1990         struct inet_timewait_sock *tw;
1991         struct hlist_node *node;
1992         struct tcp_iter_state* st = seq->private;
1993         struct net *net = seq_file_net(seq);
1994
1995         ++st->num;
1996
1997         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1998                 tw = cur;
1999                 tw = tw_next(tw);
2000 get_tw:
2001                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2002                         tw = tw_next(tw);
2003                 }
2004                 if (tw) {
2005                         cur = tw;
2006                         goto out;
2007                 }
2008                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2009                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2010
2011                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2012                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2013                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2014                 } else {
2015                         cur = NULL;
2016                         goto out;
2017                 }
2018         } else
2019                 sk = sk_next(sk);
2020
2021         sk_for_each_from(sk, node) {
2022                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2023                         goto found;
2024         }
2025
2026         st->state = TCP_SEQ_STATE_TIME_WAIT;
2027         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2028         goto get_tw;
2029 found:
2030         cur = sk;
2031 out:
2032         return cur;
2033 }
2034
2035 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2036 {
2037         void *rc = established_get_first(seq);
2038
2039         while (rc && pos) {
2040                 rc = established_get_next(seq, rc);
2041                 --pos;
2042         }
2043         return rc;
2044 }
2045
2046 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2047 {
2048         void *rc;
2049         struct tcp_iter_state* st = seq->private;
2050
2051         inet_listen_lock(&tcp_hashinfo);
2052         st->state = TCP_SEQ_STATE_LISTENING;
2053         rc        = listening_get_idx(seq, &pos);
2054
2055         if (!rc) {
2056                 inet_listen_unlock(&tcp_hashinfo);
2057                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2058                 rc        = established_get_idx(seq, pos);
2059         }
2060
2061         return rc;
2062 }
2063
2064 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2065 {
2066         struct tcp_iter_state* st = seq->private;
2067         st->state = TCP_SEQ_STATE_LISTENING;
2068         st->num = 0;
2069         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2070 }
2071
2072 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2073 {
2074         void *rc = NULL;
2075         struct tcp_iter_state* st;
2076
2077         if (v == SEQ_START_TOKEN) {
2078                 rc = tcp_get_idx(seq, 0);
2079                 goto out;
2080         }
2081         st = seq->private;
2082
2083         switch (st->state) {
2084         case TCP_SEQ_STATE_OPENREQ:
2085         case TCP_SEQ_STATE_LISTENING:
2086                 rc = listening_get_next(seq, v);
2087                 if (!rc) {
2088                         inet_listen_unlock(&tcp_hashinfo);
2089                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2090                         rc        = established_get_first(seq);
2091                 }
2092                 break;
2093         case TCP_SEQ_STATE_ESTABLISHED:
2094         case TCP_SEQ_STATE_TIME_WAIT:
2095                 rc = established_get_next(seq, v);
2096                 break;
2097         }
2098 out:
2099         ++*pos;
2100         return rc;
2101 }
2102
2103 static void tcp_seq_stop(struct seq_file *seq, void *v)
2104 {
2105         struct tcp_iter_state* st = seq->private;
2106
2107         switch (st->state) {
2108         case TCP_SEQ_STATE_OPENREQ:
2109                 if (v) {
2110                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2111                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2112                 }
2113         case TCP_SEQ_STATE_LISTENING:
2114                 if (v != SEQ_START_TOKEN)
2115                         inet_listen_unlock(&tcp_hashinfo);
2116                 break;
2117         case TCP_SEQ_STATE_TIME_WAIT:
2118         case TCP_SEQ_STATE_ESTABLISHED:
2119                 if (v)
2120                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2121                 break;
2122         }
2123 }
2124
2125 static int tcp_seq_open(struct inode *inode, struct file *file)
2126 {
2127         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2128         struct tcp_iter_state *s;
2129         int err;
2130
2131         err = seq_open_net(inode, file, &afinfo->seq_ops,
2132                           sizeof(struct tcp_iter_state));
2133         if (err < 0)
2134                 return err;
2135
2136         s = ((struct seq_file *)file->private_data)->private;
2137         s->family               = afinfo->family;
2138         return 0;
2139 }
2140
2141 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2142 {
2143         int rc = 0;
2144         struct proc_dir_entry *p;
2145
2146         afinfo->seq_fops.open           = tcp_seq_open;
2147         afinfo->seq_fops.read           = seq_read;
2148         afinfo->seq_fops.llseek         = seq_lseek;
2149         afinfo->seq_fops.release        = seq_release_net;
2150
2151         afinfo->seq_ops.start           = tcp_seq_start;
2152         afinfo->seq_ops.next            = tcp_seq_next;
2153         afinfo->seq_ops.stop            = tcp_seq_stop;
2154
2155         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2156                              &afinfo->seq_fops, afinfo);
2157         if (!p)
2158                 rc = -ENOMEM;
2159         return rc;
2160 }
2161
2162 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2163 {
2164         proc_net_remove(net, afinfo->name);
2165 }
2166
2167 static void get_openreq4(struct sock *sk, struct request_sock *req,
2168                          struct seq_file *f, int i, int uid, int *len)
2169 {
2170         const struct inet_request_sock *ireq = inet_rsk(req);
2171         int ttd = req->expires - jiffies;
2172
2173         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2174                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2175                 i,
2176                 ireq->loc_addr,
2177                 ntohs(inet_sk(sk)->sport),
2178                 ireq->rmt_addr,
2179                 ntohs(ireq->rmt_port),
2180                 TCP_SYN_RECV,
2181                 0, 0, /* could print option size, but that is af dependent. */
2182                 1,    /* timers active (only the expire timer) */
2183                 jiffies_to_clock_t(ttd),
2184                 req->retrans,
2185                 uid,
2186                 0,  /* non standard timer */
2187                 0, /* open_requests have no inode */
2188                 atomic_read(&sk->sk_refcnt),
2189                 req,
2190                 len);
2191 }
2192
2193 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2194 {
2195         int timer_active;
2196         unsigned long timer_expires;
2197         struct tcp_sock *tp = tcp_sk(sk);
2198         const struct inet_connection_sock *icsk = inet_csk(sk);
2199         struct inet_sock *inet = inet_sk(sk);
2200         __be32 dest = inet->daddr;
2201         __be32 src = inet->rcv_saddr;
2202         __u16 destp = ntohs(inet->dport);
2203         __u16 srcp = ntohs(inet->sport);
2204
2205         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2206                 timer_active    = 1;
2207                 timer_expires   = icsk->icsk_timeout;
2208         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2209                 timer_active    = 4;
2210                 timer_expires   = icsk->icsk_timeout;
2211         } else if (timer_pending(&sk->sk_timer)) {
2212                 timer_active    = 2;
2213                 timer_expires   = sk->sk_timer.expires;
2214         } else {
2215                 timer_active    = 0;
2216                 timer_expires = jiffies;
2217         }
2218
2219         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2220                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2221                 i, src, srcp, dest, destp, sk->sk_state,
2222                 tp->write_seq - tp->snd_una,
2223                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2224                                              (tp->rcv_nxt - tp->copied_seq),
2225                 timer_active,
2226                 jiffies_to_clock_t(timer_expires - jiffies),
2227                 icsk->icsk_retransmits,
2228                 sock_i_uid(sk),
2229                 icsk->icsk_probes_out,
2230                 sock_i_ino(sk),
2231                 atomic_read(&sk->sk_refcnt), sk,
2232                 jiffies_to_clock_t(icsk->icsk_rto),
2233                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2234                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2235                 tp->snd_cwnd,
2236                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2237                 len);
2238 }
2239
2240 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2241                                struct seq_file *f, int i, int *len)
2242 {
2243         __be32 dest, src;
2244         __u16 destp, srcp;
2245         int ttd = tw->tw_ttd - jiffies;
2246
2247         if (ttd < 0)
2248                 ttd = 0;
2249
2250         dest  = tw->tw_daddr;
2251         src   = tw->tw_rcv_saddr;
2252         destp = ntohs(tw->tw_dport);
2253         srcp  = ntohs(tw->tw_sport);
2254
2255         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2256                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2257                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2258                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2259                 atomic_read(&tw->tw_refcnt), tw, len);
2260 }
2261
2262 #define TMPSZ 150
2263
2264 static int tcp4_seq_show(struct seq_file *seq, void *v)
2265 {
2266         struct tcp_iter_state* st;
2267         int len;
2268
2269         if (v == SEQ_START_TOKEN) {
2270                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2271                            "  sl  local_address rem_address   st tx_queue "
2272                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2273                            "inode");
2274                 goto out;
2275         }
2276         st = seq->private;
2277
2278         switch (st->state) {
2279         case TCP_SEQ_STATE_LISTENING:
2280         case TCP_SEQ_STATE_ESTABLISHED:
2281                 get_tcp4_sock(v, seq, st->num, &len);
2282                 break;
2283         case TCP_SEQ_STATE_OPENREQ:
2284                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2285                 break;
2286         case TCP_SEQ_STATE_TIME_WAIT:
2287                 get_timewait4_sock(v, seq, st->num, &len);
2288                 break;
2289         }
2290         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2291 out:
2292         return 0;
2293 }
2294
2295 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2296         .name           = "tcp",
2297         .family         = AF_INET,
2298         .seq_fops       = {
2299                 .owner          = THIS_MODULE,
2300         },
2301         .seq_ops        = {
2302                 .show           = tcp4_seq_show,
2303         },
2304 };
2305
2306 static int tcp4_proc_init_net(struct net *net)
2307 {
2308         return tcp_proc_register(net, &tcp4_seq_afinfo);
2309 }
2310
2311 static void tcp4_proc_exit_net(struct net *net)
2312 {
2313         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2314 }
2315
2316 static struct pernet_operations tcp4_net_ops = {
2317         .init = tcp4_proc_init_net,
2318         .exit = tcp4_proc_exit_net,
2319 };
2320
2321 int __init tcp4_proc_init(void)
2322 {
2323         return register_pernet_subsys(&tcp4_net_ops);
2324 }
2325
2326 void tcp4_proc_exit(void)
2327 {
2328         unregister_pernet_subsys(&tcp4_net_ops);
2329 }
2330 #endif /* CONFIG_PROC_FS */
2331
2332 struct proto tcp_prot = {
2333         .name                   = "TCP",
2334         .owner                  = THIS_MODULE,
2335         .close                  = tcp_close,
2336         .connect                = tcp_v4_connect,
2337         .disconnect             = tcp_disconnect,
2338         .accept                 = inet_csk_accept,
2339         .ioctl                  = tcp_ioctl,
2340         .init                   = tcp_v4_init_sock,
2341         .destroy                = tcp_v4_destroy_sock,
2342         .shutdown               = tcp_shutdown,
2343         .setsockopt             = tcp_setsockopt,
2344         .getsockopt             = tcp_getsockopt,
2345         .recvmsg                = tcp_recvmsg,
2346         .backlog_rcv            = tcp_v4_do_rcv,
2347         .hash                   = inet_hash,
2348         .unhash                 = inet_unhash,
2349         .get_port               = inet_csk_get_port,
2350         .enter_memory_pressure  = tcp_enter_memory_pressure,
2351         .sockets_allocated      = &tcp_sockets_allocated,
2352         .orphan_count           = &tcp_orphan_count,
2353         .memory_allocated       = &tcp_memory_allocated,
2354         .memory_pressure        = &tcp_memory_pressure,
2355         .sysctl_mem             = sysctl_tcp_mem,
2356         .sysctl_wmem            = sysctl_tcp_wmem,
2357         .sysctl_rmem            = sysctl_tcp_rmem,
2358         .max_header             = MAX_TCP_HEADER,
2359         .obj_size               = sizeof(struct tcp_sock),
2360         .twsk_prot              = &tcp_timewait_sock_ops,
2361         .rsk_prot               = &tcp_request_sock_ops,
2362         .h.hashinfo             = &tcp_hashinfo,
2363 #ifdef CONFIG_COMPAT
2364         .compat_setsockopt      = compat_tcp_setsockopt,
2365         .compat_getsockopt      = compat_tcp_getsockopt,
2366 #endif
2367 };
2368
2369
2370 static int __net_init tcp_sk_init(struct net *net)
2371 {
2372         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2373                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2374 }
2375
2376 static void __net_exit tcp_sk_exit(struct net *net)
2377 {
2378         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2379 }
2380
2381 static struct pernet_operations __net_initdata tcp_sk_ops = {
2382        .init = tcp_sk_init,
2383        .exit = tcp_sk_exit,
2384 };
2385
2386 void __init tcp_v4_init(void)
2387 {
2388         if (register_pernet_device(&tcp_sk_ops))
2389                 panic("Failed to create the TCP control socket.\n");
2390 }
2391
/* Exported data objects. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(sysctl_tcp_low_latency);

/* Exported IPv4 TCP entry points. */
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
/* Exported /proc registration helpers. */
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
2407