2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and Alexey Kuznetsov:
51 * Support IPV6_V6ONLY socket option, which allows both
52 * IPv4 and IPv6 sockets to bind a single port at the same time.
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
65 #include <net/net_namespace.h>
67 #include <net/inet_hashtables.h>
69 #include <net/transp_v6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
74 #include <net/netdma.h>
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
88 /* Check TCP sequence numbers in ICMP packets. */
89 #define ICMP_MIN_LENGTH 8
91 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
93 #ifdef CONFIG_TCP_MD5SIG
94 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
96 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
97 __be32 saddr, __be32 daddr,
98 struct tcphdr *th, unsigned int tcplen);
101 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
107 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
108 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
109 .lhash_users = ATOMIC_INIT(0),
110 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
113 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
115 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
118 tcp_hdr(skb)->source);
121 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
123 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
124 struct tcp_sock *tp = tcp_sk(sk);
126 /* With PAWS, it is safe from the viewpoint
127 of data integrity. Even without PAWS it is safe provided sequence
128 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
130 Actually, the idea is close to VJ's one, only the timestamp cache is
131 held not per host, but per port pair, and the TW bucket is used as the state holder.
134 If the TW bucket has already been destroyed, we fall back to VJ's scheme
135 and use the initial timestamp retrieved from the peer table.
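   (A rough sanity check of the 80Mbit/sec figure above, added as an
   illustration: at ~10 Mbyte/sec the 32-bit sequence space wraps in about
   2^32 / 10^7 ~= 430 seconds, comfortably longer than a normal TIME-WAIT
   interval, so the old and new incarnations cannot overlap in sequence space.)
 */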
137 if (tcptw->tw_ts_recent_stamp &&
138 (twp == NULL || (sysctl_tcp_tw_reuse &&
139 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
140 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
141 if (tp->write_seq == 0)
143 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
144 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
152 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
154 /* This will initiate an outgoing connection. */
155 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
157 struct inet_sock *inet = inet_sk(sk);
158 struct tcp_sock *tp = tcp_sk(sk);
159 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
161 __be32 daddr, nexthop;
165 if (addr_len < sizeof(struct sockaddr_in))
168 if (usin->sin_family != AF_INET)
169 return -EAFNOSUPPORT;
171 nexthop = daddr = usin->sin_addr.s_addr;
172 if (inet->opt && inet->opt->srr) {
175 nexthop = inet->opt->faddr;
178 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
179 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
181 inet->sport, usin->sin_port, sk, 1);
183 if (tmp == -ENETUNREACH)
184 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
188 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
193 if (!inet->opt || !inet->opt->srr)
197 inet->saddr = rt->rt_src;
198 inet->rcv_saddr = inet->saddr;
200 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
201 /* Reset inherited state */
202 tp->rx_opt.ts_recent = 0;
203 tp->rx_opt.ts_recent_stamp = 0;
207 if (tcp_death_row.sysctl_tw_recycle &&
208 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
209 struct inet_peer *peer = rt_get_peer(rt);
211 * VJ's idea. We save last timestamp seen from
212 * the destination in peer table, when entering state
213 * TIME-WAIT, and initialize rx_opt.ts_recent from it,
214 * when trying a new connection.
217 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
218 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
219 tp->rx_opt.ts_recent = peer->tcp_ts;
223 inet->dport = usin->sin_port;
226 inet_csk(sk)->icsk_ext_hdr_len = 0;
228 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
230 tp->rx_opt.mss_clamp = 536;
232 /* Socket identity is still unknown (sport may be zero).
233 * However we set state to SYN-SENT and, without releasing the socket
234 * lock, select a source port, enter ourselves into the hash tables and
235 * complete initialization after this.
237 tcp_set_state(sk, TCP_SYN_SENT);
238 err = inet_hash_connect(&tcp_death_row, sk);
242 err = ip_route_newports(&rt, IPPROTO_TCP,
243 inet->sport, inet->dport, sk);
247 /* OK, now commit destination to socket. */
248 sk->sk_gso_type = SKB_GSO_TCPV4;
249 sk_setup_caps(sk, &rt->u.dst);
252 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
257 inet->id = tp->write_seq ^ jiffies;
259 err = tcp_connect(sk);
268 * This unhashes the socket and releases the local port, if necessary.
271 tcp_set_state(sk, TCP_CLOSE);
273 sk->sk_route_caps = 0;
279 * This routine does path mtu discovery as defined in RFC1191.
281 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
283 struct dst_entry *dst;
284 struct inet_sock *inet = inet_sk(sk);
286 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
287 * sent out by Linux are always < 576 bytes so they should go through unfragmented).
290 if (sk->sk_state == TCP_LISTEN)
293 /* We don't check the dst entry for whether pmtu discovery is forbidden
294 * on this route. We just assume that no packet-too-big packets
295 * are sent back when pmtu discovery is not active.
296 * There is a small race when the user changes this flag in the
297 * route, but I think that's acceptable.
299 if ((dst = __sk_dst_check(sk, 0)) == NULL)
302 dst->ops->update_pmtu(dst, mtu);
304 /* Something is about to go wrong... Remember the soft error
305 * in case this connection is not able to recover.
307 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
308 sk->sk_err_soft = EMSGSIZE;
312 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
313 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
314 tcp_sync_mss(sk, mtu);
316 /* Resend the TCP packet because it's
317 * clear that the old packet has been
318 * dropped. This is the new "fast" path mtu discovery.
321 tcp_simple_retransmit(sk);
322 } /* else let the usual retransmit timer handle it */
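
	/* A rough sketch (not the exact computation in tcp_sync_mss()) of how
	 * the reported MTU becomes an MSS clamp for this connection, ignoring
	 * IP and TCP option space:
	 *
	 *	new_mss ~= mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
	 *
	 * tcp_sync_mss() does the precise version, using the af-specific
	 * network header length and any extension headers.
	 */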
326 * This routine is called by the ICMP module when it gets some
327 * sort of error condition. If err < 0 then the socket should
328 * be closed and the error returned to the user. If err > 0
329 * it's just the icmp type << 8 | icmp code. After adjustment,
330 * the header points to the first 8 bytes of the tcp header. We need
331 * to find the appropriate port.
333 * The locking strategy used here is very "optimistic". When
334 * someone else accesses the socket the ICMP is just dropped
335 * and for some paths there is no check at all.
336 * A more general error queue to queue errors for later handling
337 * is probably better.
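 *
 * As a concrete illustration (an added example, not part of the original
 * text): an ICMP "destination unreachable / port unreachable" arrives here
 * as type ICMP_DEST_UNREACH with code ICMP_PORT_UNREACH, and the handling
 * below maps it through icmp_err_convert[code].errno to ECONNREFUSED.
 */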
341 void tcp_v4_err(struct sk_buff *skb, u32 info)
343 struct iphdr *iph = (struct iphdr *)skb->data;
344 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
346 struct inet_sock *inet;
347 const int type = icmp_hdr(skb)->type;
348 const int code = icmp_hdr(skb)->code;
353 if (skb->len < (iph->ihl << 2) + 8) {
354 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
358 sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
359 iph->saddr, th->source, inet_iif(skb));
361 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
364 if (sk->sk_state == TCP_TIME_WAIT) {
365 inet_twsk_put(inet_twsk(sk));
370 /* If too many ICMPs get dropped on busy
371 * servers this needs to be solved differently.
373 if (sock_owned_by_user(sk))
374 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
376 if (sk->sk_state == TCP_CLOSE)
380 seq = ntohl(th->seq);
381 if (sk->sk_state != TCP_LISTEN &&
382 !between(seq, tp->snd_una, tp->snd_nxt)) {
383 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
388 case ICMP_SOURCE_QUENCH:
389 /* Just silently ignore these. */
391 case ICMP_PARAMETERPROB:
394 case ICMP_DEST_UNREACH:
395 if (code > NR_ICMP_UNREACH)
398 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
399 if (!sock_owned_by_user(sk))
400 do_pmtu_discovery(sk, iph, info);
404 err = icmp_err_convert[code].errno;
406 case ICMP_TIME_EXCEEDED:
413 switch (sk->sk_state) {
414 struct request_sock *req, **prev;
416 if (sock_owned_by_user(sk))
419 req = inet_csk_search_req(sk, &prev, th->dest,
420 iph->daddr, iph->saddr);
424 /* ICMPs are not backlogged, hence we cannot get
425 an established socket here.
429 if (seq != tcp_rsk(req)->snt_isn) {
430 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
435 * Still in SYN_RECV, just remove it silently.
436 * There is no good way to pass the error to the newly
437 * created socket, and POSIX does not want network
438 * errors returned from accept().
440 inet_csk_reqsk_queue_drop(sk, req, prev);
444 case TCP_SYN_RECV: /* Cannot happen.
445 It can, e.g., if SYNs crossed.
447 if (!sock_owned_by_user(sk)) {
450 sk->sk_error_report(sk);
454 sk->sk_err_soft = err;
459 /* If we've already connected we will keep trying
460 * until we time out, or the user gives up.
462 * rfc1122 4.2.3.9 allows us to consider as hard errors
463 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
464 * but it is obsoleted by pmtu discovery).
466 * Note, that in modern internet, where routing is unreliable
467 * and in each dark corner broken firewalls sit, sending random
468 * errors ordered by their masters, even these two messages finally lose
469 * their original sense (even Linux sends invalid PORT_UNREACHs)
471 * Now we are in compliance with RFCs.
476 if (!sock_owned_by_user(sk) && inet->recverr) {
478 sk->sk_error_report(sk);
479 } else { /* Only an error on timeout */
480 sk->sk_err_soft = err;
488 /* This routine computes an IPv4 TCP checksum. */
489 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
491 struct inet_sock *inet = inet_sk(sk);
492 struct tcphdr *th = tcp_hdr(skb);
494 if (skb->ip_summed == CHECKSUM_PARTIAL) {
495 th->check = ~tcp_v4_check(len, inet->saddr,
497 skb->csum_start = skb_transport_header(skb) - skb->head;
498 skb->csum_offset = offsetof(struct tcphdr, check);
500 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
501 csum_partial((char *)th,
507 int tcp_v4_gso_send_check(struct sk_buff *skb)
509 const struct iphdr *iph;
512 if (!pskb_may_pull(skb, sizeof(*th)))
519 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
520 skb->csum_start = skb_transport_header(skb) - skb->head;
521 skb->csum_offset = offsetof(struct tcphdr, check);
522 skb->ip_summed = CHECKSUM_PARTIAL;
527 * This routine will send an RST to the other tcp.
529 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
531 * Answer: if a packet caused RST, it is not for a socket
532 * existing in our system; if it is matched to a socket,
533 * it is just a duplicate segment or a bug in the other side's TCP.
534 * So we build the reply based only on the parameters
535 * that arrived with the segment.
536 * Exception: precedence violation. We do not implement it in any case.
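 *
 *	A concrete example of the rule above (added; it simply restates what
 *	the code below does): for an unexpected SYN carrying no data we reply
 *	with seq = 0 and ack_seq = ntohl(th->seq) + 1, while for a stray ACK
 *	we echo its ack_seq back as our own seq and set no ACK bit at all.
 */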
539 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
541 struct tcphdr *th = tcp_hdr(skb);
544 #ifdef CONFIG_TCP_MD5SIG
545 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
548 struct ip_reply_arg arg;
549 #ifdef CONFIG_TCP_MD5SIG
550 struct tcp_md5sig_key *key;
553 /* Never send a reset in response to a reset. */
557 if (skb->rtable->rt_type != RTN_LOCAL)
560 /* Swap the send and the receive. */
561 memset(&rep, 0, sizeof(rep));
562 rep.th.dest = th->source;
563 rep.th.source = th->dest;
564 rep.th.doff = sizeof(struct tcphdr) / 4;
568 rep.th.seq = th->ack_seq;
571 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
572 skb->len - (th->doff << 2));
575 memset(&arg, 0, sizeof(arg));
576 arg.iov[0].iov_base = (unsigned char *)&rep;
577 arg.iov[0].iov_len = sizeof(rep.th);
579 #ifdef CONFIG_TCP_MD5SIG
580 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
582 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
584 (TCPOPT_MD5SIG << 8) |
586 /* Update length and the length the header thinks exists */
587 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
588 rep.th.doff = arg.iov[0].iov_len / 4;
590 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
594 &rep.th, arg.iov[0].iov_len);
597 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
598 ip_hdr(skb)->saddr, /* XXX */
599 sizeof(struct tcphdr), IPPROTO_TCP, 0);
600 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
602 ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
603 &arg, arg.iov[0].iov_len);
605 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
606 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
609 /* The code following below, sending ACKs in SYN-RECV and TIME-WAIT states
610 outside socket context, is ugly, certainly. What can I do?
613 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
614 u32 win, u32 ts, int oif,
615 struct tcp_md5sig_key *key)
617 struct tcphdr *th = tcp_hdr(skb);
620 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
621 #ifdef CONFIG_TCP_MD5SIG
622 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
626 struct ip_reply_arg arg;
628 memset(&rep.th, 0, sizeof(struct tcphdr));
629 memset(&arg, 0, sizeof(arg));
631 arg.iov[0].iov_base = (unsigned char *)&rep;
632 arg.iov[0].iov_len = sizeof(rep.th);
634 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635 (TCPOPT_TIMESTAMP << 8) |
637 rep.opt[1] = htonl(tcp_time_stamp);
638 rep.opt[2] = htonl(ts);
639 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
642 /* Swap the send and the receive. */
643 rep.th.dest = th->source;
644 rep.th.source = th->dest;
645 rep.th.doff = arg.iov[0].iov_len / 4;
646 rep.th.seq = htonl(seq);
647 rep.th.ack_seq = htonl(ack);
649 rep.th.window = htons(win);
651 #ifdef CONFIG_TCP_MD5SIG
653 int offset = (ts) ? 3 : 0;
655 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
657 (TCPOPT_MD5SIG << 8) |
659 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
660 rep.th.doff = arg.iov[0].iov_len/4;
662 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
666 &rep.th, arg.iov[0].iov_len);
669 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
670 ip_hdr(skb)->saddr, /* XXX */
671 arg.iov[0].iov_len, IPPROTO_TCP, 0);
672 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
674 arg.bound_dev_if = oif;
676 ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
677 &arg, arg.iov[0].iov_len);
679 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
682 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
684 struct inet_timewait_sock *tw = inet_twsk(sk);
685 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
687 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
688 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
691 tcp_twsk_md5_key(tcptw)
697 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
698 struct request_sock *req)
700 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
701 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
704 tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
708 * Send a SYN-ACK after having received a SYN.
709 * This still operates on a request_sock only, not on a big
712 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
713 struct dst_entry *dst)
715 const struct inet_request_sock *ireq = inet_rsk(req);
717 struct sk_buff * skb;
719 /* First, grab a route. */
720 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
723 skb = tcp_make_synack(sk, dst, req);
726 struct tcphdr *th = tcp_hdr(skb);
728 th->check = tcp_v4_check(skb->len,
731 csum_partial((char *)th, skb->len,
734 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
737 err = net_xmit_eval(err);
744 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
746 return __tcp_v4_send_synack(sk, req, NULL);
750 * IPv4 request_sock destructor.
752 static void tcp_v4_reqsk_destructor(struct request_sock *req)
754 kfree(inet_rsk(req)->opt);
757 #ifdef CONFIG_SYN_COOKIES
758 static void syn_flood_warning(struct sk_buff *skb)
760 static unsigned long warntime;
762 if (time_after(jiffies, (warntime + HZ * 60))) {
765 "possible SYN flooding on port %d. Sending cookies.\n",
766 ntohs(tcp_hdr(skb)->dest));
772 * Save and compile IPv4 options into the request_sock if needed.
774 static struct ip_options *tcp_v4_save_options(struct sock *sk,
777 struct ip_options *opt = &(IPCB(skb)->opt);
778 struct ip_options *dopt = NULL;
780 if (opt && opt->optlen) {
781 int opt_size = optlength(opt);
782 dopt = kmalloc(opt_size, GFP_ATOMIC);
784 if (ip_options_echo(dopt, skb)) {
793 #ifdef CONFIG_TCP_MD5SIG
795 * RFC2385 MD5 checksumming requires a mapping of
796 * IP address->MD5 Key.
797 * We need to maintain these in the sk structure.
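
/*
 * A minimal user-space sketch of how such a key gets installed (hedged; the
 * exact struct tcp_md5sig layout lives in <linux/tcp.h>): the application
 * fills tcpm_addr with the peer address and tcpm_key/tcpm_keylen with the
 * shared secret, then calls
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5sig, sizeof(md5sig));
 *
 * which is parsed by tcp_v4_parse_md5_keys() below and stored via
 * tcp_v4_md5_do_add().
 */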
800 /* Find the Key structure for an address. */
801 static struct tcp_md5sig_key *
802 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
804 struct tcp_sock *tp = tcp_sk(sk);
807 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
809 for (i = 0; i < tp->md5sig_info->entries4; i++) {
810 if (tp->md5sig_info->keys4[i].addr == addr)
811 return &tp->md5sig_info->keys4[i].base;
816 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
817 struct sock *addr_sk)
819 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
822 EXPORT_SYMBOL(tcp_v4_md5_lookup);
824 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
825 struct request_sock *req)
827 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
830 /* This can be called on a newly created socket, from other files */
831 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
832 u8 *newkey, u8 newkeylen)
834 /* Add Key to the list */
835 struct tcp_md5sig_key *key;
836 struct tcp_sock *tp = tcp_sk(sk);
837 struct tcp4_md5sig_key *keys;
839 key = tcp_v4_md5_do_lookup(sk, addr);
841 /* Pre-existing entry - just update that one. */
844 key->keylen = newkeylen;
846 struct tcp_md5sig_info *md5sig;
848 if (!tp->md5sig_info) {
849 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
851 if (!tp->md5sig_info) {
855 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
857 if (tcp_alloc_md5sig_pool() == NULL) {
861 md5sig = tp->md5sig_info;
863 if (md5sig->alloced4 == md5sig->entries4) {
864 keys = kmalloc((sizeof(*keys) *
865 (md5sig->entries4 + 1)), GFP_ATOMIC);
868 tcp_free_md5sig_pool();
872 if (md5sig->entries4)
873 memcpy(keys, md5sig->keys4,
874 sizeof(*keys) * md5sig->entries4);
876 /* Free old key list, and reference new one */
877 kfree(md5sig->keys4);
878 md5sig->keys4 = keys;
882 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
883 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
884 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
889 EXPORT_SYMBOL(tcp_v4_md5_do_add);
891 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
892 u8 *newkey, u8 newkeylen)
894 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
898 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
900 struct tcp_sock *tp = tcp_sk(sk);
903 for (i = 0; i < tp->md5sig_info->entries4; i++) {
904 if (tp->md5sig_info->keys4[i].addr == addr) {
906 kfree(tp->md5sig_info->keys4[i].base.key);
907 tp->md5sig_info->entries4--;
909 if (tp->md5sig_info->entries4 == 0) {
910 kfree(tp->md5sig_info->keys4);
911 tp->md5sig_info->keys4 = NULL;
912 tp->md5sig_info->alloced4 = 0;
913 } else if (tp->md5sig_info->entries4 != i) {
914 /* Need to do some manipulation */
915 memmove(&tp->md5sig_info->keys4[i],
916 &tp->md5sig_info->keys4[i+1],
917 (tp->md5sig_info->entries4 - i) *
918 sizeof(struct tcp4_md5sig_key));
920 tcp_free_md5sig_pool();
927 EXPORT_SYMBOL(tcp_v4_md5_do_del);
929 static void tcp_v4_clear_md5_list(struct sock *sk)
931 struct tcp_sock *tp = tcp_sk(sk);
933 /* Free each key, then the key array itself,
934 * the crypto element, and then decrement our
935 * hold on the last resort crypto.
937 if (tp->md5sig_info->entries4) {
939 for (i = 0; i < tp->md5sig_info->entries4; i++)
940 kfree(tp->md5sig_info->keys4[i].base.key);
941 tp->md5sig_info->entries4 = 0;
942 tcp_free_md5sig_pool();
944 if (tp->md5sig_info->keys4) {
945 kfree(tp->md5sig_info->keys4);
946 tp->md5sig_info->keys4 = NULL;
947 tp->md5sig_info->alloced4 = 0;
951 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
954 struct tcp_md5sig cmd;
955 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
958 if (optlen < sizeof(cmd))
961 if (copy_from_user(&cmd, optval, sizeof(cmd)))
964 if (sin->sin_family != AF_INET)
967 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
968 if (!tcp_sk(sk)->md5sig_info)
970 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
973 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
976 if (!tcp_sk(sk)->md5sig_info) {
977 struct tcp_sock *tp = tcp_sk(sk);
978 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
984 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
987 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
990 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
991 newkey, cmd.tcpm_keylen);
994 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
995 __be32 saddr, __be32 daddr,
999 struct tcp_md5sig_pool *hp;
1000 struct tcp4_pseudohdr *bp;
1004 * Okay, so RFC2385 is turned on for this connection,
1005 * so we need to generate the MD5 hash for the packet now.
1008 hp = tcp_get_md5sig_pool();
1010 goto clear_hash_noput;
1012 bp = &hp->md5_blk.ip4;
1015 * The TCP pseudo-header (in the order: source IP address,
1016 * destination IP address, zero-padded protocol number, and segment length).
1022 bp->protocol = IPPROTO_TCP;
1023 bp->len = htons(tcplen);
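
	/* For reference, the layout just filled in (a sketch mirroring
	 * struct tcp4_pseudohdr): saddr, daddr, one zero pad byte, the
	 * protocol number and the segment length - exactly the pseudo-header
	 * that the normal TCP checksum covers.
	 */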
1025 err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp),
1030 /* Free up the crypto pool */
1031 tcp_put_md5sig_pool();
1035 tcp_put_md5sig_pool();
1037 memset(md5_hash, 0, 16);
1041 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1043 struct dst_entry *dst,
1044 struct request_sock *req,
1046 unsigned int tcplen)
1048 __be32 saddr, daddr;
1051 saddr = inet_sk(sk)->saddr;
1052 daddr = inet_sk(sk)->daddr;
1054 struct rtable *rt = (struct rtable *)dst;
1059 return tcp_v4_do_calc_md5_hash(md5_hash, key,
1064 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1066 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1069 * This gets called for each TCP segment that arrives
1070 * so we want to be efficient.
1071 * We have 3 drop cases:
1072 * o No MD5 hash and one expected.
1073 * o MD5 hash and we're not expecting one.
1074 * o MD5 hash and it's wrong.
1076 __u8 *hash_location = NULL;
1077 struct tcp_md5sig_key *hash_expected;
1078 const struct iphdr *iph = ip_hdr(skb);
1079 struct tcphdr *th = tcp_hdr(skb);
1081 unsigned char newhash[16];
1083 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1084 hash_location = tcp_parse_md5sig_option(th);
1086 /* We've parsed the options - do we have a hash? */
1087 if (!hash_expected && !hash_location)
1090 if (hash_expected && !hash_location) {
1091 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1092 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1093 NIPQUAD(iph->saddr), ntohs(th->source),
1094 NIPQUAD(iph->daddr), ntohs(th->dest));
1098 if (!hash_expected && hash_location) {
1099 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1100 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1101 NIPQUAD(iph->saddr), ntohs(th->source),
1102 NIPQUAD(iph->daddr), ntohs(th->dest));
1106 /* Okay, so this is hash_expected and hash_location -
1107 * so we need to calculate the MD5 hash.
1109 genhash = tcp_v4_do_calc_md5_hash(newhash,
1111 iph->saddr, iph->daddr,
1114 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1115 if (net_ratelimit()) {
1116 printk(KERN_INFO "MD5 Hash failed for "
1117 "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1118 NIPQUAD(iph->saddr), ntohs(th->source),
1119 NIPQUAD(iph->daddr), ntohs(th->dest),
1120 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1129 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1131 .obj_size = sizeof(struct tcp_request_sock),
1132 .rtx_syn_ack = tcp_v4_send_synack,
1133 .send_ack = tcp_v4_reqsk_send_ack,
1134 .destructor = tcp_v4_reqsk_destructor,
1135 .send_reset = tcp_v4_send_reset,
1138 #ifdef CONFIG_TCP_MD5SIG
1139 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1140 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1144 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1145 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1146 .twsk_unique = tcp_twsk_unique,
1147 .twsk_destructor= tcp_twsk_destructor,
1150 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1152 struct inet_request_sock *ireq;
1153 struct tcp_options_received tmp_opt;
1154 struct request_sock *req;
1155 __be32 saddr = ip_hdr(skb)->saddr;
1156 __be32 daddr = ip_hdr(skb)->daddr;
1157 __u32 isn = TCP_SKB_CB(skb)->when;
1158 struct dst_entry *dst = NULL;
1159 #ifdef CONFIG_SYN_COOKIES
1160 int want_cookie = 0;
1162 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1165 /* Never answer SYNs sent to broadcast or multicast */
1166 if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1169 /* TW buckets are converted to open requests without
1170 * limitations; they conserve resources and the peer is
1171 * evidently a real one.
1173 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1174 #ifdef CONFIG_SYN_COOKIES
1175 if (sysctl_tcp_syncookies) {
1182 /* Accept backlog is full. If we have already queued enough
1183 * warm entries in the syn queue, drop the request. It is better than
1184 * clogging the syn queue with openreqs with exponentially increasing timeout.
1187 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1190 req = reqsk_alloc(&tcp_request_sock_ops);
1194 #ifdef CONFIG_TCP_MD5SIG
1195 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1198 tcp_clear_options(&tmp_opt);
1199 tmp_opt.mss_clamp = 536;
1200 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1202 tcp_parse_options(skb, &tmp_opt, 0);
1204 if (want_cookie && !tmp_opt.saw_tstamp)
1205 tcp_clear_options(&tmp_opt);
1207 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1208 /* Some OSes (unknown ones, but I see them on a web server, which
1209 * contains information interesting only for Windows
1210 * users) do not send their timestamp in SYN. It is the easy case.
1211 * We simply do not advertise TS support.
1213 tmp_opt.saw_tstamp = 0;
1214 tmp_opt.tstamp_ok = 0;
1216 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1218 tcp_openreq_init(req, &tmp_opt, skb);
1220 if (security_inet_conn_request(sk, skb, req))
1223 ireq = inet_rsk(req);
1224 ireq->loc_addr = daddr;
1225 ireq->rmt_addr = saddr;
1226 ireq->opt = tcp_v4_save_options(sk, skb);
1228 TCP_ECN_create_request(req, tcp_hdr(skb));
1231 #ifdef CONFIG_SYN_COOKIES
1232 syn_flood_warning(skb);
1233 req->cookie_ts = tmp_opt.tstamp_ok;
1235 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1237 struct inet_peer *peer = NULL;
1239 /* VJ's idea. We save last timestamp seen
1240 * from the destination in peer table, when entering
1241 * state TIME-WAIT, and check against it before
1242 * accepting new connection request.
1244 * If "isn" is not zero, this request hit an alive
1245 * timewait bucket, so all the necessary checks
1246 * are made in the function processing the timewait state.
1248 if (tmp_opt.saw_tstamp &&
1249 tcp_death_row.sysctl_tw_recycle &&
1250 (dst = inet_csk_route_req(sk, req)) != NULL &&
1251 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1252 peer->v4daddr == saddr) {
1253 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1254 (s32)(peer->tcp_ts - req->ts_recent) >
1256 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1257 goto drop_and_release;
1260 /* Kill the following clause, if you dislike this way. */
1261 else if (!sysctl_tcp_syncookies &&
1262 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1263 (sysctl_max_syn_backlog >> 2)) &&
1264 (!peer || !peer->tcp_ts_stamp) &&
1265 (!dst || !dst_metric(dst, RTAX_RTT))) {
1266 /* Without syncookies, the last quarter of the
1267 * backlog is filled with destinations
1268 * proven to be alive.
1269 * It means that we continue to communicate
1270 * with destinations already remembered
1271 * at the moment of the synflood.
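			 *
			 * A worked example with illustrative numbers (not
			 * from the original comment): with
			 * sysctl_max_syn_backlog at 1024 the test above
			 * starts dropping unproven destinations once fewer
			 * than 256 free slots remain, i.e. once the SYN
			 * queue is more than three quarters full.
			 */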
1273 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1274 "request from " NIPQUAD_FMT "/%u\n",
1276 ntohs(tcp_hdr(skb)->source));
1277 goto drop_and_release;
1280 isn = tcp_v4_init_sequence(skb);
1282 tcp_rsk(req)->snt_isn = isn;
1284 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1287 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1300 * The three way handshake has completed - we got a valid synack -
1301 * now create the new socket.
1303 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1304 struct request_sock *req,
1305 struct dst_entry *dst)
1307 struct inet_request_sock *ireq;
1308 struct inet_sock *newinet;
1309 struct tcp_sock *newtp;
1311 #ifdef CONFIG_TCP_MD5SIG
1312 struct tcp_md5sig_key *key;
1315 if (sk_acceptq_is_full(sk))
1318 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1321 newsk = tcp_create_openreq_child(sk, req, skb);
1325 newsk->sk_gso_type = SKB_GSO_TCPV4;
1326 sk_setup_caps(newsk, dst);
1328 newtp = tcp_sk(newsk);
1329 newinet = inet_sk(newsk);
1330 ireq = inet_rsk(req);
1331 newinet->daddr = ireq->rmt_addr;
1332 newinet->rcv_saddr = ireq->loc_addr;
1333 newinet->saddr = ireq->loc_addr;
1334 newinet->opt = ireq->opt;
1336 newinet->mc_index = inet_iif(skb);
1337 newinet->mc_ttl = ip_hdr(skb)->ttl;
1338 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1340 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1341 newinet->id = newtp->write_seq ^ jiffies;
1343 tcp_mtup_init(newsk);
1344 tcp_sync_mss(newsk, dst_mtu(dst));
1345 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1346 tcp_initialize_rcv_mss(newsk);
1348 #ifdef CONFIG_TCP_MD5SIG
1349 /* Copy over the MD5 key from the original socket */
1350 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1352 * We're using one, so create a matching key
1353 * on the newsk structure. If we fail to get
1354 * memory, then we end up not copying the key across.
1357 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1359 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1360 newkey, key->keylen);
1364 __inet_hash_nolisten(newsk);
1365 __inet_inherit_port(sk, newsk);
1370 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1372 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1377 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1379 struct tcphdr *th = tcp_hdr(skb);
1380 const struct iphdr *iph = ip_hdr(skb);
1382 struct request_sock **prev;
1383 /* Find possible connection requests. */
1384 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1385 iph->saddr, iph->daddr);
1387 return tcp_check_req(sk, skb, req, prev);
1389 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1390 th->source, iph->daddr, th->dest, inet_iif(skb));
1393 if (nsk->sk_state != TCP_TIME_WAIT) {
1397 inet_twsk_put(inet_twsk(nsk));
1401 #ifdef CONFIG_SYN_COOKIES
1402 if (!th->rst && !th->syn && th->ack)
1403 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1408 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1410 const struct iphdr *iph = ip_hdr(skb);
1412 if (skb->ip_summed == CHECKSUM_COMPLETE) {
1413 if (!tcp_v4_check(skb->len, iph->saddr,
1414 iph->daddr, skb->csum)) {
1415 skb->ip_summed = CHECKSUM_UNNECESSARY;
1420 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1421 skb->len, IPPROTO_TCP, 0);
1423 if (skb->len <= 76) {
1424 return __skb_checksum_complete(skb);
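	/* (Added note: segments of at most 76 bytes are cheap enough to
	 *  verify immediately, as above; longer ones keep the pseudo-header
	 *  sum in skb->csum and are verified later, e.g. via
	 *  tcp_checksum_complete() or while copying to user space.) */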
1430 /* The socket must have its spinlock held when we get here.
1433 * We have a potential double-lock case here, so even when
1434 * doing backlog processing we use the BH locking scheme.
1435 * This is because we cannot sleep with the original spinlock held.
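 *
 * The expected calling pattern (a sketch of what tcp_v4_rcv() below does):
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);	-- process in softirq context
 *	else
 *		sk_add_backlog(sk, skb);	-- owner holds the lock: defer
 *	bh_unlock_sock(sk);
 */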
1438 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1441 #ifdef CONFIG_TCP_MD5SIG
1443 * We really want to reject the packet as early as possible if:
1445 * o We're expecting an MD5'd packet and there is no MD5 tcp option
1446 * o There is an MD5 option and we're not expecting one
1448 if (tcp_v4_inbound_md5_hash(sk, skb))
1452 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1453 TCP_CHECK_TIMER(sk);
1454 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1458 TCP_CHECK_TIMER(sk);
1462 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1465 if (sk->sk_state == TCP_LISTEN) {
1466 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1471 if (tcp_child_process(sk, nsk, skb)) {
1479 TCP_CHECK_TIMER(sk);
1480 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1484 TCP_CHECK_TIMER(sk);
1488 tcp_v4_send_reset(rsk, skb);
1491 /* Be careful here. If this function gets more complicated and
1492 * gcc suffers from register pressure on the x86, sk (in %ebx)
1493 * might be destroyed here. This current version compiles correctly,
1494 * but you have been warned.
1499 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1507 int tcp_v4_rcv(struct sk_buff *skb)
1509 const struct iphdr *iph;
1514 if (skb->pkt_type != PACKET_HOST)
1517 /* Count it even if it's bad */
1518 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1520 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1525 if (th->doff < sizeof(struct tcphdr) / 4)
1527 if (!pskb_may_pull(skb, th->doff * 4))
1530 /* An explanation is required here, I think.
1531 * Packet length and doff are validated by header prediction,
1532 * provided the case of th->doff == 0 is eliminated.
1533 * So, we defer the checks. */
1534 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1539 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1540 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1541 skb->len - th->doff * 4);
1542 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1543 TCP_SKB_CB(skb)->when = 0;
1544 TCP_SKB_CB(skb)->flags = iph->tos;
1545 TCP_SKB_CB(skb)->sacked = 0;
1547 sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1548 th->source, iph->daddr, th->dest, inet_iif(skb));
1553 if (sk->sk_state == TCP_TIME_WAIT)
1556 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1557 goto discard_and_relse;
1560 if (sk_filter(sk, skb))
1561 goto discard_and_relse;
1565 bh_lock_sock_nested(sk);
1567 if (!sock_owned_by_user(sk)) {
1568 #ifdef CONFIG_NET_DMA
1569 struct tcp_sock *tp = tcp_sk(sk);
1570 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1571 tp->ucopy.dma_chan = get_softnet_dma();
1572 if (tp->ucopy.dma_chan)
1573 ret = tcp_v4_do_rcv(sk, skb);
1577 if (!tcp_prequeue(sk, skb))
1578 ret = tcp_v4_do_rcv(sk, skb);
1581 sk_add_backlog(sk, skb);
1589 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1592 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1594 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1596 tcp_v4_send_reset(NULL, skb);
1600 /* Discard frame. */
1609 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1610 inet_twsk_put(inet_twsk(sk));
1614 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1615 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1616 inet_twsk_put(inet_twsk(sk));
1619 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1621 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1623 iph->daddr, th->dest,
1626 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1627 inet_twsk_put(inet_twsk(sk));
1631 /* Fall through to ACK */
1634 tcp_v4_timewait_ack(sk, skb);
1638 case TCP_TW_SUCCESS:;
1643 /* VJ's idea. Save last timestamp seen from this destination
1644 * and hold it at least for the normal timewait interval, to use for duplicate
1645 * segment detection in subsequent connections, before they enter the synchronized state.
1649 int tcp_v4_remember_stamp(struct sock *sk)
1651 struct inet_sock *inet = inet_sk(sk);
1652 struct tcp_sock *tp = tcp_sk(sk);
1653 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1654 struct inet_peer *peer = NULL;
1657 if (!rt || rt->rt_dst != inet->daddr) {
1658 peer = inet_getpeer(inet->daddr, 1);
1662 rt_bind_peer(rt, 1);
1667 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1668 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1669 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1670 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1671 peer->tcp_ts = tp->rx_opt.ts_recent;
1681 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1683 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1686 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1688 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1689 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1690 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1691 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1692 peer->tcp_ts = tcptw->tw_ts_recent;
1701 struct inet_connection_sock_af_ops ipv4_specific = {
1702 .queue_xmit = ip_queue_xmit,
1703 .send_check = tcp_v4_send_check,
1704 .rebuild_header = inet_sk_rebuild_header,
1705 .conn_request = tcp_v4_conn_request,
1706 .syn_recv_sock = tcp_v4_syn_recv_sock,
1707 .remember_stamp = tcp_v4_remember_stamp,
1708 .net_header_len = sizeof(struct iphdr),
1709 .setsockopt = ip_setsockopt,
1710 .getsockopt = ip_getsockopt,
1711 .addr2sockaddr = inet_csk_addr2sockaddr,
1712 .sockaddr_len = sizeof(struct sockaddr_in),
1713 .bind_conflict = inet_csk_bind_conflict,
1714 #ifdef CONFIG_COMPAT
1715 .compat_setsockopt = compat_ip_setsockopt,
1716 .compat_getsockopt = compat_ip_getsockopt,
1720 #ifdef CONFIG_TCP_MD5SIG
1721 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1722 .md5_lookup = tcp_v4_md5_lookup,
1723 .calc_md5_hash = tcp_v4_calc_md5_hash,
1724 .md5_add = tcp_v4_md5_add_func,
1725 .md5_parse = tcp_v4_parse_md5_keys,
1729 /* NOTE: A lot of things are set to zero explicitly by the call to
1730 * sk_alloc(), so they need not be done here.
1732 static int tcp_v4_init_sock(struct sock *sk)
1734 struct inet_connection_sock *icsk = inet_csk(sk);
1735 struct tcp_sock *tp = tcp_sk(sk);
1737 skb_queue_head_init(&tp->out_of_order_queue);
1738 tcp_init_xmit_timers(sk);
1739 tcp_prequeue_init(tp);
1741 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1742 tp->mdev = TCP_TIMEOUT_INIT;
1744 /* So many TCP implementations out there (incorrectly) count the
1745 * initial SYN frame in their delayed-ACK and congestion control
1746 * algorithms that we must have the following bandaid to talk
1747 * efficiently to them. -DaveM
1751 /* See draft-stevens-tcpca-spec-01 for discussion of the
1752 * initialization of these values.
1754 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1755 tp->snd_cwnd_clamp = ~0;
1756 tp->mss_cache = 536;
1758 tp->reordering = sysctl_tcp_reordering;
1759 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1761 sk->sk_state = TCP_CLOSE;
1763 sk->sk_write_space = sk_stream_write_space;
1764 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1766 icsk->icsk_af_ops = &ipv4_specific;
1767 icsk->icsk_sync_mss = tcp_sync_mss;
1768 #ifdef CONFIG_TCP_MD5SIG
1769 tp->af_specific = &tcp_sock_ipv4_specific;
1772 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1773 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1775 atomic_inc(&tcp_sockets_allocated);
1780 int tcp_v4_destroy_sock(struct sock *sk)
1782 struct tcp_sock *tp = tcp_sk(sk);
1784 tcp_clear_xmit_timers(sk);
1786 tcp_cleanup_congestion_control(sk);
1788 /* Clean up the write buffer. */
1789 tcp_write_queue_purge(sk);
1791 /* Cleans up our, hopefully empty, out_of_order_queue. */
1792 __skb_queue_purge(&tp->out_of_order_queue);
1794 #ifdef CONFIG_TCP_MD5SIG
1795 /* Clean up the MD5 key list, if any */
1796 if (tp->md5sig_info) {
1797 tcp_v4_clear_md5_list(sk);
1798 kfree(tp->md5sig_info);
1799 tp->md5sig_info = NULL;
1803 #ifdef CONFIG_NET_DMA
1804 /* Cleans up our sk_async_wait_queue */
1805 __skb_queue_purge(&sk->sk_async_wait_queue);
1808 /* Clean prequeue, it must be empty really */
1809 __skb_queue_purge(&tp->ucopy.prequeue);
1811 /* Clean up a referenced TCP bind bucket. */
1812 if (inet_csk(sk)->icsk_bind_hash)
1816 * If sendmsg cached page exists, toss it.
1818 if (sk->sk_sndmsg_page) {
1819 __free_page(sk->sk_sndmsg_page);
1820 sk->sk_sndmsg_page = NULL;
1823 if (tp->defer_tcp_accept.request) {
1824 reqsk_free(tp->defer_tcp_accept.request);
1825 sock_put(tp->defer_tcp_accept.listen_sk);
1827 tp->defer_tcp_accept.listen_sk = NULL;
1828 tp->defer_tcp_accept.request = NULL;
1831 atomic_dec(&tcp_sockets_allocated);
1836 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1838 #ifdef CONFIG_PROC_FS
1839 /* Proc filesystem TCP sock list dumping. */
1841 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1843 return hlist_empty(head) ? NULL :
1844 list_entry(head->first, struct inet_timewait_sock, tw_node);
1847 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1849 return tw->tw_node.next ?
1850 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1853 static void *listening_get_next(struct seq_file *seq, void *cur)
1855 struct inet_connection_sock *icsk;
1856 struct hlist_node *node;
1857 struct sock *sk = cur;
1858 struct tcp_iter_state* st = seq->private;
1859 struct net *net = seq_file_net(seq);
1863 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1869 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1870 struct request_sock *req = cur;
1872 icsk = inet_csk(st->syn_wait_sk);
1876 if (req->rsk_ops->family == st->family &&
1877 net_eq(sock_net(req->sk), net)) {
1883 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1886 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1888 sk = sk_next(st->syn_wait_sk);
1889 st->state = TCP_SEQ_STATE_LISTENING;
1890 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1892 icsk = inet_csk(sk);
1893 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1894 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1896 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1900 sk_for_each_from(sk, node) {
1901 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1905 icsk = inet_csk(sk);
1906 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1907 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1909 st->uid = sock_i_uid(sk);
1910 st->syn_wait_sk = sk;
1911 st->state = TCP_SEQ_STATE_OPENREQ;
1915 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1917 if (++st->bucket < INET_LHTABLE_SIZE) {
1918 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1926 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1928 void *rc = listening_get_next(seq, NULL);
1930 while (rc && *pos) {
1931 rc = listening_get_next(seq, rc);
1937 static void *established_get_first(struct seq_file *seq)
1939 struct tcp_iter_state* st = seq->private;
1940 struct net *net = seq_file_net(seq);
1943 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1945 struct hlist_node *node;
1946 struct inet_timewait_sock *tw;
1947 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1950 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1951 if (sk->sk_family != st->family ||
1952 !net_eq(sock_net(sk), net)) {
1958 st->state = TCP_SEQ_STATE_TIME_WAIT;
1959 inet_twsk_for_each(tw, node,
1960 &tcp_hashinfo.ehash[st->bucket].twchain) {
1961 if (tw->tw_family != st->family ||
1962 !net_eq(twsk_net(tw), net)) {
1968 read_unlock_bh(lock);
1969 st->state = TCP_SEQ_STATE_ESTABLISHED;
1975 static void *established_get_next(struct seq_file *seq, void *cur)
1977 struct sock *sk = cur;
1978 struct inet_timewait_sock *tw;
1979 struct hlist_node *node;
1980 struct tcp_iter_state* st = seq->private;
1981 struct net *net = seq_file_net(seq);
1985 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1989 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1996 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1997 st->state = TCP_SEQ_STATE_ESTABLISHED;
1999 if (++st->bucket < tcp_hashinfo.ehash_size) {
2000 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2001 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2009 sk_for_each_from(sk, node) {
2010 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2014 st->state = TCP_SEQ_STATE_TIME_WAIT;
2015 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2023 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2025 void *rc = established_get_first(seq);
2028 rc = established_get_next(seq, rc);
2034 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2037 struct tcp_iter_state* st = seq->private;
2039 inet_listen_lock(&tcp_hashinfo);
2040 st->state = TCP_SEQ_STATE_LISTENING;
2041 rc = listening_get_idx(seq, &pos);
2044 inet_listen_unlock(&tcp_hashinfo);
2045 st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 rc = established_get_idx(seq, pos);
2052 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2054 struct tcp_iter_state* st = seq->private;
2055 st->state = TCP_SEQ_STATE_LISTENING;
2057 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2060 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2063 struct tcp_iter_state* st;
2065 if (v == SEQ_START_TOKEN) {
2066 rc = tcp_get_idx(seq, 0);
2071 switch (st->state) {
2072 case TCP_SEQ_STATE_OPENREQ:
2073 case TCP_SEQ_STATE_LISTENING:
2074 rc = listening_get_next(seq, v);
2076 inet_listen_unlock(&tcp_hashinfo);
2077 st->state = TCP_SEQ_STATE_ESTABLISHED;
2078 rc = established_get_first(seq);
2081 case TCP_SEQ_STATE_ESTABLISHED:
2082 case TCP_SEQ_STATE_TIME_WAIT:
2083 rc = established_get_next(seq, v);
2091 static void tcp_seq_stop(struct seq_file *seq, void *v)
2093 struct tcp_iter_state* st = seq->private;
2095 switch (st->state) {
2096 case TCP_SEQ_STATE_OPENREQ:
2098 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2099 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2101 case TCP_SEQ_STATE_LISTENING:
2102 if (v != SEQ_START_TOKEN)
2103 inet_listen_unlock(&tcp_hashinfo);
2105 case TCP_SEQ_STATE_TIME_WAIT:
2106 case TCP_SEQ_STATE_ESTABLISHED:
2108 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2113 static int tcp_seq_open(struct inode *inode, struct file *file)
2115 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2116 struct tcp_iter_state *s;
2119 err = seq_open_net(inode, file, &afinfo->seq_ops,
2120 sizeof(struct tcp_iter_state));
2124 s = ((struct seq_file *)file->private_data)->private;
2125 s->family = afinfo->family;
2129 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2132 struct proc_dir_entry *p;
2134 afinfo->seq_fops.open = tcp_seq_open;
2135 afinfo->seq_fops.read = seq_read;
2136 afinfo->seq_fops.llseek = seq_lseek;
2137 afinfo->seq_fops.release = seq_release_net;
2139 afinfo->seq_ops.start = tcp_seq_start;
2140 afinfo->seq_ops.next = tcp_seq_next;
2141 afinfo->seq_ops.stop = tcp_seq_stop;
2143 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2144 &afinfo->seq_fops, afinfo);
2150 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2152 proc_net_remove(net, afinfo->name);
2155 static void get_openreq4(struct sock *sk, struct request_sock *req,
2156 struct seq_file *f, int i, int uid, int *len)
2158 const struct inet_request_sock *ireq = inet_rsk(req);
2159 int ttd = req->expires - jiffies;
2161 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2162 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2165 ntohs(inet_sk(sk)->sport),
2167 ntohs(ireq->rmt_port),
2169 0, 0, /* could print option size, but that is af dependent. */
2170 1, /* timers active (only the expire timer) */
2171 jiffies_to_clock_t(ttd),
2174 0, /* non standard timer */
2175 0, /* open_requests have no inode */
2176 atomic_read(&sk->sk_refcnt),
2181 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2184 unsigned long timer_expires;
2185 struct tcp_sock *tp = tcp_sk(sk);
2186 const struct inet_connection_sock *icsk = inet_csk(sk);
2187 struct inet_sock *inet = inet_sk(sk);
2188 __be32 dest = inet->daddr;
2189 __be32 src = inet->rcv_saddr;
2190 __u16 destp = ntohs(inet->dport);
2191 __u16 srcp = ntohs(inet->sport);
2193 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2195 timer_expires = icsk->icsk_timeout;
2196 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2198 timer_expires = icsk->icsk_timeout;
2199 } else if (timer_pending(&sk->sk_timer)) {
2201 timer_expires = sk->sk_timer.expires;
2204 timer_expires = jiffies;
2207 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2208 "%08X %5d %8d %lu %d %p %u %u %u %u %d%n",
2209 i, src, srcp, dest, destp, sk->sk_state,
2210 tp->write_seq - tp->snd_una,
2211 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2212 (tp->rcv_nxt - tp->copied_seq),
2214 jiffies_to_clock_t(timer_expires - jiffies),
2215 icsk->icsk_retransmits,
2217 icsk->icsk_probes_out,
2219 atomic_read(&sk->sk_refcnt), sk,
2222 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2224 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2228 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2229 struct seq_file *f, int i, int *len)
2233 int ttd = tw->tw_ttd - jiffies;
2238 dest = tw->tw_daddr;
2239 src = tw->tw_rcv_saddr;
2240 destp = ntohs(tw->tw_dport);
2241 srcp = ntohs(tw->tw_sport);
2243 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2244 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2245 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2246 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2247 atomic_read(&tw->tw_refcnt), tw, len);
2252 static int tcp4_seq_show(struct seq_file *seq, void *v)
2254 struct tcp_iter_state* st;
2257 if (v == SEQ_START_TOKEN) {
2258 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2259 " sl local_address rem_address st tx_queue "
2260 "rx_queue tr tm->when retrnsmt uid timeout "
2266 switch (st->state) {
2267 case TCP_SEQ_STATE_LISTENING:
2268 case TCP_SEQ_STATE_ESTABLISHED:
2269 get_tcp4_sock(v, seq, st->num, &len);
2271 case TCP_SEQ_STATE_OPENREQ:
2272 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2274 case TCP_SEQ_STATE_TIME_WAIT:
2275 get_timewait4_sock(v, seq, st->num, &len);
2278 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2283 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2287 .owner = THIS_MODULE,
2290 .show = tcp4_seq_show,
2294 static int tcp4_proc_init_net(struct net *net)
2296 return tcp_proc_register(net, &tcp4_seq_afinfo);
2299 static void tcp4_proc_exit_net(struct net *net)
2301 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2304 static struct pernet_operations tcp4_net_ops = {
2305 .init = tcp4_proc_init_net,
2306 .exit = tcp4_proc_exit_net,
2309 int __init tcp4_proc_init(void)
2311 return register_pernet_subsys(&tcp4_net_ops);
2314 void tcp4_proc_exit(void)
2316 unregister_pernet_subsys(&tcp4_net_ops);
2318 #endif /* CONFIG_PROC_FS */
2320 struct proto tcp_prot = {
2322 .owner = THIS_MODULE,
2324 .connect = tcp_v4_connect,
2325 .disconnect = tcp_disconnect,
2326 .accept = inet_csk_accept,
2328 .init = tcp_v4_init_sock,
2329 .destroy = tcp_v4_destroy_sock,
2330 .shutdown = tcp_shutdown,
2331 .setsockopt = tcp_setsockopt,
2332 .getsockopt = tcp_getsockopt,
2333 .recvmsg = tcp_recvmsg,
2334 .backlog_rcv = tcp_v4_do_rcv,
2336 .unhash = inet_unhash,
2337 .get_port = inet_csk_get_port,
2338 .enter_memory_pressure = tcp_enter_memory_pressure,
2339 .sockets_allocated = &tcp_sockets_allocated,
2340 .orphan_count = &tcp_orphan_count,
2341 .memory_allocated = &tcp_memory_allocated,
2342 .memory_pressure = &tcp_memory_pressure,
2343 .sysctl_mem = sysctl_tcp_mem,
2344 .sysctl_wmem = sysctl_tcp_wmem,
2345 .sysctl_rmem = sysctl_tcp_rmem,
2346 .max_header = MAX_TCP_HEADER,
2347 .obj_size = sizeof(struct tcp_sock),
2348 .twsk_prot = &tcp_timewait_sock_ops,
2349 .rsk_prot = &tcp_request_sock_ops,
2350 .h.hashinfo = &tcp_hashinfo,
2351 #ifdef CONFIG_COMPAT
2352 .compat_setsockopt = compat_tcp_setsockopt,
2353 .compat_getsockopt = compat_tcp_getsockopt,
2358 static int __net_init tcp_sk_init(struct net *net)
2360 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2361 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2364 static void __net_exit tcp_sk_exit(struct net *net)
2366 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2369 static struct pernet_operations __net_initdata tcp_sk_ops = {
2370 .init = tcp_sk_init,
2371 .exit = tcp_sk_exit,
2374 void __init tcp_v4_init(void)
2376 if (register_pernet_device(&tcp_sk_ops))
2377 panic("Failed to create the TCP control socket.\n");
2380 EXPORT_SYMBOL(ipv4_specific);
2381 EXPORT_SYMBOL(tcp_hashinfo);
2382 EXPORT_SYMBOL(tcp_prot);
2383 EXPORT_SYMBOL(tcp_v4_conn_request);
2384 EXPORT_SYMBOL(tcp_v4_connect);
2385 EXPORT_SYMBOL(tcp_v4_do_rcv);
2386 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2387 EXPORT_SYMBOL(tcp_v4_send_check);
2388 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2390 #ifdef CONFIG_PROC_FS
2391 EXPORT_SYMBOL(tcp_proc_register);
2392 EXPORT_SYMBOL(tcp_proc_unregister);
2394 EXPORT_SYMBOL(sysctl_tcp_low_latency);