git.oblomov.eu Git - linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55 #include <linux/config.h>
  56
  57 #include <linux/types.h>
  58 #include <linux/fcntl.h>
  59 #include <linux/module.h>
  60 #include <linux/random.h>
  61 #include <linux/cache.h>
  62 #include <linux/jhash.h>
  63 #include <linux/init.h>
  64 #include <linux/times.h>
  65
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/xfrm.h>
  73
  74 #include <linux/inet.h>
  75 #include <linux/ipv6.h>
  76 #include <linux/stddef.h>
  77 #include <linux/proc_fs.h>
  78 #include <linux/seq_file.h>
  79
  80 int sysctl_tcp_tw_reuse;
  81 int sysctl_tcp_low_latency;
  82
  83 /* Check TCP sequence numbers in ICMP packets. */
  84 #define ICMP_MIN_LENGTH 8
  85
  86 /* Socket used for sending RSTs */
  87 static struct socket *tcp_socket;
  88
  89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
  90                        struct sk_buff *skb);
  91
  92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
  93         .lhash_lock     = RW_LOCK_UNLOCKED,
  94         .lhash_users    = ATOMIC_INIT(0),
  95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
  96 };
  97
  98 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
  99 {
 100         return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 101 }
 102
 103 static void tcp_v4_hash(struct sock *sk)
 104 {
 105         inet_hash(&tcp_hashinfo, sk);
 106 }
 107
 108 void tcp_unhash(struct sock *sk)
 109 {
 110         inet_unhash(&tcp_hashinfo, sk);
 111 }
 112
 113 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 114 {
 115         return secure_tcp_sequence_number(skb->nh.iph->daddr,
 116                                           skb->nh.iph->saddr,
 117                                           skb->h.th->dest,
 118                                           skb->h.th->source);
 119 }
 120
 121 /* called with local bh disabled */
 122 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 123                                       struct inet_timewait_sock **twp)
 124 {
 125         struct inet_sock *inet = inet_sk(sk);
 126         u32 daddr = inet->rcv_saddr;
 127         u32 saddr = inet->daddr;
 128         int dif = sk->sk_bound_dev_if;
 129         INET_ADDR_COOKIE(acookie, saddr, daddr)
 130         const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
 131         unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
 132         struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
 133         struct sock *sk2;
 134         const struct hlist_node *node;
 135         struct inet_timewait_sock *tw;
 136
 137         prefetch(head->chain.first);
 138         write_lock(&head->lock);
 139
 140         /* Check TIME-WAIT sockets first. */
 141         sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
 142                 tw = inet_twsk(sk2);
 143
 144                 if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
 145                         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 146                         struct tcp_sock *tp = tcp_sk(sk);
 147
 148                         /* With PAWS, it is safe from the viewpoint
 149                            of data integrity. Even without PAWS it
 150                            is safe provided sequence spaces do not
 151                            overlap i.e. at data rates <= 80Mbit/sec.
 152
 153                            Actually, the idea is close to VJ's one,
 154                            only timestamp cache is held not per host,
 155                            but per port pair and TW bucket is used
 156                            as state holder.
 157
 158                            If TW bucket has been already destroyed we
 159                            fall back to VJ's scheme and use initial
 160                            timestamp retrieved from peer table.
 161                          */
 162                         if (tcptw->tw_ts_recent_stamp &&
 163                             (!twp || (sysctl_tcp_tw_reuse &&
 164                                       xtime.tv_sec -
 165                                       tcptw->tw_ts_recent_stamp > 1))) {
 166                                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 167                                 if (tp->write_seq == 0)
 168                                         tp->write_seq = 1;
 169                                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 170                                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 171                                 sock_hold(sk2);
 172                                 goto unique;
 173                         } else
 174                                 goto not_unique;
 175                 }
 176         }
 177         tw = NULL;
 178
 179         /* And established part... */
 180         sk_for_each(sk2, node, &head->chain) {
 181                 if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
 182                         goto not_unique;
 183         }
 184
 185 unique:
 186         /* Must record num and sport now. Otherwise we will see
 187          * in hash table socket with a funny identity. */
 188         inet->num = lport;
 189         inet->sport = htons(lport);
 190         sk->sk_hash = hash;
 191         BUG_TRAP(sk_unhashed(sk));
 192         __sk_add_node(sk, &head->chain);
 193         sock_prot_inc_use(sk->sk_prot);
 194         write_unlock(&head->lock);
 195
 196         if (twp) {
 197                 *twp = tw;
 198                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 199         } else if (tw) {
 200                 /* Silly. Should hash-dance instead... */
 201                 inet_twsk_deschedule(tw, &tcp_death_row);
 202                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 203
 204                 inet_twsk_put(tw);
 205         }
 206
 207         return 0;
 208
 209 not_unique:
 210         write_unlock(&head->lock);
 211         return -EADDRNOTAVAIL;
 212 }
 213
 214 static inline u32 connect_port_offset(const struct sock *sk)
 215 {
 216         const struct inet_sock *inet = inet_sk(sk);
 217
 218         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
 219                                          inet->dport);
 220 }
 221
 222 /*
 223  * Bind a port for a connect operation and hash it.
 224  */
 225 static inline int tcp_v4_hash_connect(struct sock *sk)
 226 {
 227         const unsigned short snum = inet_sk(sk)->num;
 228         struct inet_bind_hashbucket *head;
 229         struct inet_bind_bucket *tb;
 230         int ret;
 231
 232         if (!snum) {
 233                 int low = sysctl_local_port_range[0];
 234                 int high = sysctl_local_port_range[1];
 235                 int range = high - low;
 236                 int i;
 237                 int port;
 238                 static u32 hint;
 239                 u32 offset = hint + connect_port_offset(sk);
 240                 struct hlist_node *node;
 241                 struct inet_timewait_sock *tw = NULL;
 242
 243                 local_bh_disable();
 244                 for (i = 1; i <= range; i++) {
 245                         port = low + (i + offset) % range;
 246                         head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
 247                         spin_lock(&head->lock);
 248
 249                         /* Does not bother with rcv_saddr checks,
 250                          * because the established check is already
 251                          * unique enough.
 252                          */
 253                         inet_bind_bucket_for_each(tb, node, &head->chain) {
 254                                 if (tb->port == port) {
 255                                         BUG_TRAP(!hlist_empty(&tb->owners));
 256                                         if (tb->fastreuse >= 0)
 257                                                 goto next_port;
 258                                         if (!__tcp_v4_check_established(sk,
 259                                                                         port,
 260                                                                         &tw))
 261                                                 goto ok;
 262                                         goto next_port;
 263                                 }
 264                         }
 265
 266                         tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
 267                         if (!tb) {
 268                                 spin_unlock(&head->lock);
 269                                 break;
 270                         }
 271                         tb->fastreuse = -1;
 272                         goto ok;
 273
 274                 next_port:
 275                         spin_unlock(&head->lock);
 276                 }
 277                 local_bh_enable();
 278
 279                 return -EADDRNOTAVAIL;
 280
 281 ok:
 282                 hint += i;
 283
 284                 /* Head lock still held and bh's disabled */
 285                 inet_bind_hash(sk, tb, port);
 286                 if (sk_unhashed(sk)) {
 287                         inet_sk(sk)->sport = htons(port);
 288                         __inet_hash(&tcp_hashinfo, sk, 0);
 289                 }
 290                 spin_unlock(&head->lock);
 291
 292                 if (tw) {
 293                         inet_twsk_deschedule(tw, &tcp_death_row);;
 294                         inet_twsk_put(tw);
 295                 }
 296
 297                 ret = 0;
 298                 goto out;
 299         }
 300
 301         head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
 302         tb  = inet_csk(sk)->icsk_bind_hash;
 303         spin_lock_bh(&head->lock);
 304         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 305                 __inet_hash(&tcp_hashinfo, sk, 0);
 306                 spin_unlock_bh(&head->lock);
 307                 return 0;
 308         } else {
 309                 spin_unlock(&head->lock);
 310                 /* No definite answer... Walk to established hash table */
 311                 ret = __tcp_v4_check_established(sk, snum, NULL);
 312 out:
 313                 local_bh_enable();
 314                 return ret;
 315         }
 316 }
 317
 318 /* This will initiate an outgoing connection. */
 319 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 320 {
 321         struct inet_sock *inet = inet_sk(sk);
 322         struct tcp_sock *tp = tcp_sk(sk);
 323         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 324         struct rtable *rt;
 325         u32 daddr, nexthop;
 326         int tmp;
 327         int err;
 328
 329         if (addr_len < sizeof(struct sockaddr_in))
 330                 return -EINVAL;
 331
 332         if (usin->sin_family != AF_INET)
 333                 return -EAFNOSUPPORT;
 334
 335         nexthop = daddr = usin->sin_addr.s_addr;
 336         if (inet->opt && inet->opt->srr) {
 337                 if (!daddr)
 338                         return -EINVAL;
 339                 nexthop = inet->opt->faddr;
 340         }
 341
 342         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 343                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 344                                IPPROTO_TCP,
 345                                inet->sport, usin->sin_port, sk);
 346         if (tmp < 0)
 347                 return tmp;
 348
 349         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 350                 ip_rt_put(rt);
 351                 return -ENETUNREACH;
 352         }
 353
 354         if (!inet->opt || !inet->opt->srr)
 355                 daddr = rt->rt_dst;
 356
 357         if (!inet->saddr)
 358                 inet->saddr = rt->rt_src;
 359         inet->rcv_saddr = inet->saddr;
 360
 361         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 362                 /* Reset inherited state */
 363                 tp->rx_opt.ts_recent       = 0;
 364                 tp->rx_opt.ts_recent_stamp = 0;
 365                 tp->write_seq              = 0;
 366         }
 367
 368         if (tcp_death_row.sysctl_tw_recycle &&
 369             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 370                 struct inet_peer *peer = rt_get_peer(rt);
 371
 372                 /* VJ's idea. We save last timestamp seen from
 373                  * the destination in peer table, when entering state TIME-WAIT
 374                  * and initialize rx_opt.ts_recent from it, when trying new connection.
 375                  */
 376
 377                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
 378                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 379                         tp->rx_opt.ts_recent = peer->tcp_ts;
 380                 }
 381         }
 382
 383         inet->dport = usin->sin_port;
 384         inet->daddr = daddr;
 385
 386         tp->ext_header_len = 0;
 387         if (inet->opt)
 388                 tp->ext_header_len = inet->opt->optlen;
 389
 390         tp->rx_opt.mss_clamp = 536;
 391
 392         /* Socket identity is still unknown (sport may be zero).
 393          * However we set state to SYN-SENT and not releasing socket
 394          * lock select source port, enter ourselves into the hash tables and
 395          * complete initialization after this.
 396          */
 397         tcp_set_state(sk, TCP_SYN_SENT);
 398         err = tcp_v4_hash_connect(sk);
 399         if (err)
 400                 goto failure;
 401
 402         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
 403         if (err)
 404                 goto failure;
 405
 406         /* OK, now commit destination to socket.  */
 407         sk_setup_caps(sk, &rt->u.dst);
 408
 409         if (!tp->write_seq)
 410                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 411                                                            inet->daddr,
 412                                                            inet->sport,
 413                                                            usin->sin_port);
 414
 415         inet->id = tp->write_seq ^ jiffies;
 416
 417         err = tcp_connect(sk);
 418         rt = NULL;
 419         if (err)
 420                 goto failure;
 421
 422         return 0;
 423
 424 failure:
 425         /* This unhashes the socket and releases the local port, if necessary. */
 426         tcp_set_state(sk, TCP_CLOSE);
 427         ip_rt_put(rt);
 428         sk->sk_route_caps = 0;
 429         inet->dport = 0;
 430         return err;
 431 }
 432
 433 /*
 434  * This routine does path mtu discovery as defined in RFC1191.
 435  */
 436 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 437                                      u32 mtu)
 438 {
 439         struct dst_entry *dst;
 440         struct inet_sock *inet = inet_sk(sk);
 441         struct tcp_sock *tp = tcp_sk(sk);
 442
 443         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 444          * send out by Linux are always <576bytes so they should go through
 445          * unfragmented).
 446          */
 447         if (sk->sk_state == TCP_LISTEN)
 448                 return;
 449
 450         /* We don't check in the destentry if pmtu discovery is forbidden
 451          * on this route. We just assume that no packet_to_big packets
 452          * are send back when pmtu discovery is not active.
 453          * There is a small race when the user changes this flag in the
 454          * route, but I think that's acceptable.
 455          */
 456         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 457                 return;
 458
 459         dst->ops->update_pmtu(dst, mtu);
 460
 461         /* Something is about to be wrong... Remember soft error
 462          * for the case, if this connection will not able to recover.
 463          */
 464         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 465                 sk->sk_err_soft = EMSGSIZE;
 466
 467         mtu = dst_mtu(dst);
 468
 469         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 470             tp->pmtu_cookie > mtu) {
 471                 tcp_sync_mss(sk, mtu);
 472
 473                 /* Resend the TCP packet because it's
 474                  * clear that the old packet has been
 475                  * dropped. This is the new "fast" path mtu
 476                  * discovery.
 477                  */
 478                 tcp_simple_retransmit(sk);
 479         } /* else let the usual retransmit timer handle it */
 480 }
 481
 482 /*
 483  * This routine is called by the ICMP module when it gets some
 484  * sort of error condition.  If err < 0 then the socket should
 485  * be closed and the error returned to the user.  If err > 0
 486  * it's just the icmp type << 8 | icmp code.  After adjustment
 487  * header points to the first 8 bytes of the tcp header.  We need
 488  * to find the appropriate port.
 489  *
 490  * The locking strategy used here is very "optimistic". When
 491  * someone else accesses the socket the ICMP is just dropped
 492  * and for some paths there is no check at all.
 493  * A more general error queue to queue errors for later handling
 494  * is probably better.
 495  *
 496  */
 497
 498 void tcp_v4_err(struct sk_buff *skb, u32 info)
 499 {
 500         struct iphdr *iph = (struct iphdr *)skb->data;
 501         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 502         struct tcp_sock *tp;
 503         struct inet_sock *inet;
 504         int type = skb->h.icmph->type;
 505         int code = skb->h.icmph->code;
 506         struct sock *sk;
 507         __u32 seq;
 508         int err;
 509
 510         if (skb->len < (iph->ihl << 2) + 8) {
 511                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 512                 return;
 513         }
 514
 515         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
 516                          th->source, inet_iif(skb));
 517         if (!sk) {
 518                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 519                 return;
 520         }
 521         if (sk->sk_state == TCP_TIME_WAIT) {
 522                 inet_twsk_put((struct inet_timewait_sock *)sk);
 523                 return;
 524         }
 525
 526         bh_lock_sock(sk);
 527         /* If too many ICMPs get dropped on busy
 528          * servers this needs to be solved differently.
 529          */
 530         if (sock_owned_by_user(sk))
 531                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 532
 533         if (sk->sk_state == TCP_CLOSE)
 534                 goto out;
 535
 536         tp = tcp_sk(sk);
 537         seq = ntohl(th->seq);
 538         if (sk->sk_state != TCP_LISTEN &&
 539             !between(seq, tp->snd_una, tp->snd_nxt)) {
 540                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
 541                 goto out;
 542         }
 543
 544         switch (type) {
 545         case ICMP_SOURCE_QUENCH:
 546                 /* Just silently ignore these. */
 547                 goto out;
 548         case ICMP_PARAMETERPROB:
 549                 err = EPROTO;
 550                 break;
 551         case ICMP_DEST_UNREACH:
 552                 if (code > NR_ICMP_UNREACH)
 553                         goto out;
 554
 555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 556                         if (!sock_owned_by_user(sk))
 557                                 do_pmtu_discovery(sk, iph, info);
 558                         goto out;
 559                 }
 560
 561                 err = icmp_err_convert[code].errno;
 562                 break;
 563         case ICMP_TIME_EXCEEDED:
 564                 err = EHOSTUNREACH;
 565                 break;
 566         default:
 567                 goto out;
 568         }
 569
 570         switch (sk->sk_state) {
 571                 struct request_sock *req, **prev;
 572         case TCP_LISTEN:
 573                 if (sock_owned_by_user(sk))
 574                         goto out;
 575
 576                 req = inet_csk_search_req(sk, &prev, th->dest,
 577                                           iph->daddr, iph->saddr);
 578                 if (!req)
 579                         goto out;
 580
 581                 /* ICMPs are not backlogged, hence we cannot get
 582                    an established socket here.
 583                  */
 584                 BUG_TRAP(!req->sk);
 585
 586                 if (seq != tcp_rsk(req)->snt_isn) {
 587                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 588                         goto out;
 589                 }
 590
 591                 /*
 592                  * Still in SYN_RECV, just remove it silently.
 593                  * There is no good way to pass the error to the newly
 594                  * created socket, and POSIX does not want network
 595                  * errors returned from accept().
 596                  */
 597                 inet_csk_reqsk_queue_drop(sk, req, prev);
 598                 goto out;
 599
 600         case TCP_SYN_SENT:
 601         case TCP_SYN_RECV:  /* Cannot happen.
 602                                It can f.e. if SYNs crossed.
 603                              */
 604                 if (!sock_owned_by_user(sk)) {
 605                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 606                         sk->sk_err = err;
 607
 608                         sk->sk_error_report(sk);
 609
 610                         tcp_done(sk);
 611                 } else {
 612                         sk->sk_err_soft = err;
 613                 }
 614                 goto out;
 615         }
 616
 617         /* If we've already connected we will keep trying
 618          * until we time out, or the user gives up.
 619          *
 620          * rfc1122 4.2.3.9 allows to consider as hard errors
 621          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 622          * but it is obsoleted by pmtu discovery).
 623          *
 624          * Note, that in modern internet, where routing is unreliable
 625          * and in each dark corner broken firewalls sit, sending random
 626          * errors ordered by their masters even this two messages finally lose
 627          * their original sense (even Linux sends invalid PORT_UNREACHs)
 628          *
 629          * Now we are in compliance with RFCs.
 630          *                                                      --ANK (980905)
 631          */
 632
 633         inet = inet_sk(sk);
 634         if (!sock_owned_by_user(sk) && inet->recverr) {
 635                 sk->sk_err = err;
 636                 sk->sk_error_report(sk);
 637         } else  { /* Only an error on timeout */
 638                 sk->sk_err_soft = err;
 639         }
 640
 641 out:
 642         bh_unlock_sock(sk);
 643         sock_put(sk);
 644 }
 645
 646 /* This routine computes an IPv4 TCP checksum. */
 647 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 648                        struct sk_buff *skb)
 649 {
 650         struct inet_sock *inet = inet_sk(sk);
 651
 652         if (skb->ip_summed == CHECKSUM_HW) {
 653                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
 654                 skb->csum = offsetof(struct tcphdr, check);
 655         } else {
 656                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
 657                                          csum_partial((char *)th,
 658                                                       th->doff << 2,
 659                                                       skb->csum));
 660         }
 661 }
 662
 663 /*
 664  *      This routine will send an RST to the other tcp.
 665  *
 666  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 667  *                    for reset.
 668  *      Answer: if a packet caused RST, it is not for a socket
 669  *              existing in our system, if it is matched to a socket,
 670  *              it is just duplicate segment or bug in other side's TCP.
 671  *              So that we build reply only basing on parameters
 672  *              arrived with segment.
 673  *      Exception: precedence violation. We do not implement it in any case.
 674  */
 675
 676 static void tcp_v4_send_reset(struct sk_buff *skb)
 677 {
 678         struct tcphdr *th = skb->h.th;
 679         struct tcphdr rth;
 680         struct ip_reply_arg arg;
 681
 682         /* Never send a reset in response to a reset. */
 683         if (th->rst)
 684                 return;
 685
 686         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
 687                 return;
 688
 689         /* Swap the send and the receive. */
 690         memset(&rth, 0, sizeof(struct tcphdr));
 691         rth.dest   = th->source;
 692         rth.source = th->dest;
 693         rth.doff   = sizeof(struct tcphdr) / 4;
 694         rth.rst    = 1;
 695
 696         if (th->ack) {
 697                 rth.seq = th->ack_seq;
 698         } else {
 699                 rth.ack = 1;
 700                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 701                                     skb->len - (th->doff << 2));
 702         }
 703
 704         memset(&arg, 0, sizeof arg);
 705         arg.iov[0].iov_base = (unsigned char *)&rth;
 706         arg.iov[0].iov_len  = sizeof rth;
 707         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 708                                       skb->nh.iph->saddr, /*XXX*/
 709                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 710         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 711
 712         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
 713
 714         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 715         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 716 }
 717
 718 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 719    outside socket context is ugly, certainly. What can I do?
 720  */
 721
 722 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 723                             u32 win, u32 ts)
 724 {
 725         struct tcphdr *th = skb->h.th;
 726         struct {
 727                 struct tcphdr th;
 728                 u32 tsopt[3];
 729         } rep;
 730         struct ip_reply_arg arg;
 731
 732         memset(&rep.th, 0, sizeof(struct tcphdr));
 733         memset(&arg, 0, sizeof arg);
 734
 735         arg.iov[0].iov_base = (unsigned char *)&rep;
 736         arg.iov[0].iov_len  = sizeof(rep.th);
 737         if (ts) {
 738                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 739                                      (TCPOPT_TIMESTAMP << 8) |
 740                                      TCPOLEN_TIMESTAMP);
 741                 rep.tsopt[1] = htonl(tcp_time_stamp);
 742                 rep.tsopt[2] = htonl(ts);
 743                 arg.iov[0].iov_len = sizeof(rep);
 744         }
 745
 746         /* Swap the send and the receive. */
 747         rep.th.dest    = th->source;
 748         rep.th.source  = th->dest;
 749         rep.th.doff    = arg.iov[0].iov_len / 4;
 750         rep.th.seq     = htonl(seq);
 751         rep.th.ack_seq = htonl(ack);
 752         rep.th.ack     = 1;
 753         rep.th.window  = htons(win);
 754
 755         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
 756                                       skb->nh.iph->saddr, /*XXX*/
 757                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 758         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 759
 760         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 761
 762         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 763 }
 764
 765 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 766 {
 767         struct inet_timewait_sock *tw = inet_twsk(sk);
 768         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 769
 770         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 771                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 772
 773         inet_twsk_put(tw);
 774 }
 775
 776 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 777 {
 778         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 779                         req->ts_recent);
 780 }
 781
 782 /*
 783  *      Send a SYN-ACK after having received an ACK.
 784  *      This still operates on a request_sock only, not on a big
 785  *      socket.
 786  */
 787 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 788                               struct dst_entry *dst)
 789 {
 790         const struct inet_request_sock *ireq = inet_rsk(req);
 791         int err = -1;
 792         struct sk_buff * skb;
 793
 794         /* First, grab a route. */
 795         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 796                 goto out;
 797
 798         skb = tcp_make_synack(sk, dst, req);
 799
 800         if (skb) {
 801                 struct tcphdr *th = skb->h.th;
 802
 803                 th->check = tcp_v4_check(th, skb->len,
 804                                          ireq->loc_addr,
 805                                          ireq->rmt_addr,
 806                                          csum_partial((char *)th, skb->len,
 807                                                       skb->csum));
 808
 809                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 810                                             ireq->rmt_addr,
 811                                             ireq->opt);
 812                 if (err == NET_XMIT_CN)
 813                         err = 0;
 814         }
 815
 816 out:
 817         dst_release(dst);
 818         return err;
 819 }
 820
 821 /*
 822  *      IPv4 request_sock destructor.
 823  */
 824 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 825 {
 826         kfree(inet_rsk(req)->opt);
 827 }
 828
 829 static inline void syn_flood_warning(struct sk_buff *skb)
 830 {
 831         static unsigned long warntime;
 832
 833         if (time_after(jiffies, (warntime + HZ * 60))) {
 834                 warntime = jiffies;
 835                 printk(KERN_INFO
 836                        "possible SYN flooding on port %d. Sending cookies.\n",
 837                        ntohs(skb->h.th->dest));
 838         }
 839 }
 840
 841 /*
 842  * Save and compile IPv4 options into the request_sock if needed.
 843  */
 844 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 845                                                      struct sk_buff *skb)
 846 {
 847         struct ip_options *opt = &(IPCB(skb)->opt);
 848         struct ip_options *dopt = NULL;
 849
 850         if (opt && opt->optlen) {
 851                 int opt_size = optlength(opt);
 852                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 853                 if (dopt) {
 854                         if (ip_options_echo(dopt, skb)) {
 855                                 kfree(dopt);
 856                                 dopt = NULL;
 857                         }
 858                 }
 859         }
 860         return dopt;
 861 }
 862
 863 struct request_sock_ops tcp_request_sock_ops = {
 864         .family         =       PF_INET,
 865         .obj_size       =       sizeof(struct tcp_request_sock),
 866         .rtx_syn_ack    =       tcp_v4_send_synack,
 867         .send_ack       =       tcp_v4_reqsk_send_ack,
 868         .destructor     =       tcp_v4_reqsk_destructor,
 869         .send_reset     =       tcp_v4_send_reset,
 870 };
 871
/*
 * Handle an incoming connection request (SYN) on the listening socket sk.
 *
 * Allocates a request_sock, parses the TCP options carried by skb, picks
 * an initial sequence number (a syncookie when want_cookie is set) and
 * answers with a SYN-ACK through tcp_v4_send_synack().  On success the
 * request is added to the listener's request hash, except when a cookie
 * was sent, in which case the request is freed right away.
 *
 * Always returns 0.  Requests that are dropped are only accounted in
 * TCP_MIB_ATTEMPTFAILS.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct inet_request_sock *ireq;
        struct tcp_options_received tmp_opt;
        struct request_sock *req;
        __u32 saddr = skb->nh.iph->saddr;
        __u32 daddr = skb->nh.iph->daddr;
        /* Non-zero when the request hit a live timewait bucket (see below). */
        __u32 isn = TCP_SKB_CB(skb)->when;
        struct dst_entry *dst = NULL;
        /* NOTE(review): without CONFIG_SYN_COOKIES want_cookie becomes a
         * macro that is not #undef'd anywhere in this function.
         */
#ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

        /* Never answer SYNs sent to broadcast or multicast */
        if (((struct rtable *)skb->dst)->rt_flags &
            (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
                if (sysctl_tcp_syncookies) {
                        want_cookie = 1;
                } else
#endif
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;

        req = reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
                goto drop;

        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = 536;
        tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

        tcp_parse_options(skb, &tmp_opt, 0);

        /* When answering with a cookie the parsed options are discarded. */
        if (want_cookie) {
                tcp_clear_options(&tmp_opt);
                tmp_opt.saw_tstamp = 0;
        }

        if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
                /* Some OSes (unknown ones, but I see them on web server, which
                 * contains information interesting only for windows'
                 * users) do not send their stamp in SYN. It is easy case.
                 * We simply do not advertise TS support.
                 */
                tmp_opt.saw_tstamp = 0;
                /* NOTE(review): redundant store, tstamp_ok is overwritten
                 * unconditionally by the assignment right after this block.
                 */
                tmp_opt.tstamp_ok  = 0;
        }
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

        tcp_openreq_init(req, &tmp_opt, skb);

        /* Addresses are stored from the local point of view. */
        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->opt = tcp_v4_save_options(sk, skb);
        if (!want_cookie)
                TCP_ECN_create_request(req, skb->h.th);

        if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
                syn_flood_warning(skb);
#endif
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
        } else if (!isn) {
                struct inet_peer *peer = NULL;

                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
                    (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
                        /* Reject when the peer's remembered timestamp is
                         * recent and ahead of the one in this request.
                         */
                        if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
                                dst_release(dst);
                                goto drop_and_free;
                        }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                         (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                          (sysctl_max_syn_backlog >> 2)) &&
                         (!peer || !peer->tcp_ts_stamp) &&
                         (!dst || !dst_metric(dst, RTAX_RTT))) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
                                       "request from %u.%u.%u.%u/%u\n",
                                       NIPQUAD(saddr),
                                       ntohs(skb->h.th->source));
                        dst_release(dst);
                        goto drop_and_free;
                }

                isn = tcp_v4_init_sequence(sk, skb);
        }
        tcp_rsk(req)->snt_isn = isn;

        /* tcp_v4_send_synack() releases dst on all of its paths, so no
         * dst_release() is needed here.
         */
        if (tcp_v4_send_synack(sk, req, dst))
                goto drop_and_free;

        if (want_cookie) {
                /* Cookie sent: the request is not kept. */
                reqsk_free(req);
        } else {
                inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        }
        return 0;

drop_and_free:
        reqsk_free(req);
drop:
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        return 0;
}
1019
1020
1021 /*
1022  * The three way handshake has completed - we got a valid synack -
1023  * now create the new socket.
1024  */
1025 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1026                                   struct request_sock *req,
1027                                   struct dst_entry *dst)
1028 {
1029         struct inet_request_sock *ireq;
1030         struct inet_sock *newinet;
1031         struct tcp_sock *newtp;
1032         struct sock *newsk;
1033
1034         if (sk_acceptq_is_full(sk))
1035                 goto exit_overflow;
1036
1037         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1038                 goto exit;
1039
1040         newsk = tcp_create_openreq_child(sk, req, skb);
1041         if (!newsk)
1042                 goto exit;
1043
1044         sk_setup_caps(newsk, dst);
1045
1046         newtp                 = tcp_sk(newsk);
1047         newinet               = inet_sk(newsk);
1048         ireq                  = inet_rsk(req);
1049         newinet->daddr        = ireq->rmt_addr;
1050         newinet->rcv_saddr    = ireq->loc_addr;
1051         newinet->saddr        = ireq->loc_addr;
1052         newinet->opt          = ireq->opt;
1053         ireq->opt             = NULL;
1054         newinet->mc_index     = inet_iif(skb);
1055         newinet->mc_ttl       = skb->nh.iph->ttl;
1056         newtp->ext_header_len = 0;
1057         if (newinet->opt)
1058                 newtp->ext_header_len = newinet->opt->optlen;
1059         newinet->id = newtp->write_seq ^ jiffies;
1060
1061         tcp_sync_mss(newsk, dst_mtu(dst));
1062         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1063         tcp_initialize_rcv_mss(newsk);
1064
1065         __inet_hash(&tcp_hashinfo, newsk, 0);
1066         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1067
1068         return newsk;
1069
1070 exit_overflow:
1071         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1072 exit:
1073         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1074         dst_release(dst);
1075         return NULL;
1076 }
1077
1078 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1079 {
1080         struct tcphdr *th = skb->h.th;
1081         struct iphdr *iph = skb->nh.iph;
1082         struct sock *nsk;
1083         struct request_sock **prev;
1084         /* Find possible connection requests. */
1085         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1086                                                        iph->saddr, iph->daddr);
1087         if (req)
1088                 return tcp_check_req(sk, skb, req, prev);
1089
1090         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
1091                                         th->source, skb->nh.iph->daddr,
1092                                         ntohs(th->dest), inet_iif(skb));
1093
1094         if (nsk) {
1095                 if (nsk->sk_state != TCP_TIME_WAIT) {
1096                         bh_lock_sock(nsk);
1097                         return nsk;
1098                 }
1099                 inet_twsk_put((struct inet_timewait_sock *)nsk);
1100                 return NULL;
1101         }
1102
1103 #ifdef CONFIG_SYN_COOKIES
1104         if (!th->rst && !th->syn && th->ack)
1105                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1106 #endif
1107         return sk;
1108 }
1109
1110 static int tcp_v4_checksum_init(struct sk_buff *skb)
1111 {
1112         if (skb->ip_summed == CHECKSUM_HW) {
1113                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1114                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1115                                   skb->nh.iph->daddr, skb->csum))
1116                         return 0;
1117
1118                 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1119                 skb->ip_summed = CHECKSUM_NONE;
1120         }
1121         if (skb->len <= 76) {
1122                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1123                                  skb->nh.iph->daddr,
1124                                  skb_checksum(skb, 0, skb->len, 0)))
1125                         return -1;
1126                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1127         } else {
1128                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1129                                           skb->nh.iph->saddr,
1130                                           skb->nh.iph->daddr, 0);
1131         }
1132         return 0;
1133 }
1134
1135
1136 /* The socket must have it's spinlock held when we get
1137  * here.
1138  *
1139  * We have a potential double-lock case here, so even when
1140  * doing backlog processing we use the BH locking scheme.
1141  * This is because we cannot sleep with the original spinlock
1142  * held.
1143  */
1144 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1145 {
1146         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1147                 TCP_CHECK_TIMER(sk);
1148                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1149                         goto reset;
1150                 TCP_CHECK_TIMER(sk);
1151                 return 0;
1152         }
1153
1154         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1155                 goto csum_err;
1156
1157         if (sk->sk_state == TCP_LISTEN) {
1158                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1159                 if (!nsk)
1160                         goto discard;
1161
1162                 if (nsk != sk) {
1163                         if (tcp_child_process(sk, nsk, skb))
1164                                 goto reset;
1165                         return 0;
1166                 }
1167         }
1168
1169         TCP_CHECK_TIMER(sk);
1170         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1171                 goto reset;
1172         TCP_CHECK_TIMER(sk);
1173         return 0;
1174
1175 reset:
1176         tcp_v4_send_reset(skb);
1177 discard:
1178         kfree_skb(skb);
1179         /* Be careful here. If this function gets more complicated and
1180          * gcc suffers from register pressure on the x86, sk (in %ebx)
1181          * might be destroyed here. This current version compiles correctly,
1182          * but you have been warned.
1183          */
1184         return 0;
1185
1186 csum_err:
1187         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1188         goto discard;
1189 }
1190
/*
 *      From tcp_input.c
 *
 *      Receive one IPv4 TCP segment: validate the header, fill in the
 *      TCP control block, look up the owning socket and dispatch the skb
 *      to it (directly, through the prequeue, or onto the backlog).
 *      Segments with no matching socket are answered with a reset unless
 *      they are malformed; timewait sockets get their own handling.
 *
 *      Returns the value of tcp_v4_do_rcv() when the segment was processed
 *      directly, 0 in every other case.
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
        struct tcphdr *th;
        struct sock *sk;
        int ret;

        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = skb->h.th;

        /* Data offset smaller than the fixed header is invalid. */
        if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
             tcp_v4_checksum_init(skb) < 0))
                goto bad_packet;

        /* Header pointer is re-read here -- presumably because the pull
         * above may have moved the data; confirm against pskb_may_pull().
         */
        th = skb->h.th;
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        /* SYN and FIN each occupy one unit of sequence space. */
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when    = 0;
        /* The IP TOS byte is stashed in the control block flags. */
        TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
        TCP_SKB_CB(skb)->sacked  = 0;

        sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
                           skb->nh.iph->daddr, ntohs(th->dest),
                           inet_iif(skb));

        if (!sk)
                goto no_tcp_socket;

process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        /* If a user context owns the socket the skb goes to the backlog;
         * otherwise it is offered to the prequeue and, if that declines,
         * processed right here.
         */
        bh_lock_sock(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);

        sock_put(sk);

        return ret;

no_tcp_socket:
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        /* bad_packet sits inside the if-body so that the early header
         * failures above share the TCP_MIB_INERRS accounting and skip
         * the reset.
         */
        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(skb);
        }

discard_it:
        /* Discard frame. */
        kfree_skb(skb);
        return 0;

discard_and_relse:
        sock_put(sk);
        goto discard_it;

do_time_wait:
        /* From here on sk is really an inet_timewait_sock. */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }

        if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
                TCP_INC_STATS_BH(TCP_MIB_INERRS);
                inet_twsk_put((struct inet_timewait_sock *) sk);
                goto discard_it;
        }
        switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
                                           skb, th)) {
        case TCP_TW_SYN: {
                /* A listener exists for this SYN: retire the timewait
                 * bucket and process the segment on the listener instead.
                 */
                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
                                                        skb->nh.iph->daddr,
                                                        ntohs(th->dest),
                                                        inet_iif(skb));
                if (sk2) {
                        inet_twsk_deschedule((struct inet_timewait_sock *)sk,
                                             &tcp_death_row);
                        inet_twsk_put((struct inet_timewait_sock *)sk);
                        sk = sk2;
                        goto process;
                }
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
                /* tcp_v4_timewait_ack() does the inet_twsk_put() itself. */
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                goto no_tcp_socket;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}
1322
1323 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1324 {
1325         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1326         struct inet_sock *inet = inet_sk(sk);
1327
1328         sin->sin_family         = AF_INET;
1329         sin->sin_addr.s_addr    = inet->daddr;
1330         sin->sin_port           = inet->dport;
1331 }
1332
1333 /* VJ's idea. Save last timestamp seen from this destination
1334  * and hold it at least for normal timewait interval to use for duplicate
1335  * segment detection in subsequent connections, before they enter synchronized
1336  * state.
1337  */
1338
1339 int tcp_v4_remember_stamp(struct sock *sk)
1340 {
1341         struct inet_sock *inet = inet_sk(sk);
1342         struct tcp_sock *tp = tcp_sk(sk);
1343         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1344         struct inet_peer *peer = NULL;
1345         int release_it = 0;
1346
1347         if (!rt || rt->rt_dst != inet->daddr) {
1348                 peer = inet_getpeer(inet->daddr, 1);
1349                 release_it = 1;
1350         } else {
1351                 if (!rt->peer)
1352                         rt_bind_peer(rt, 1);
1353                 peer = rt->peer;
1354         }
1355
1356         if (peer) {
1357                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1358                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1359                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1360                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1361                         peer->tcp_ts = tp->rx_opt.ts_recent;
1362                 }
1363                 if (release_it)
1364                         inet_putpeer(peer);
1365                 return 1;
1366         }
1367
1368         return 0;
1369 }
1370
1371 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1372 {
1373         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1374
1375         if (peer) {
1376                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1377
1378                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1379                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1380                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1381                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1382                         peer->tcp_ts       = tcptw->tw_ts_recent;
1383                 }
1384                 inet_putpeer(peer);
1385                 return 1;
1386         }
1387
1388         return 0;
1389 }
1390
1391 struct tcp_func ipv4_specific = {
1392         .queue_xmit     =       ip_queue_xmit,
1393         .send_check     =       tcp_v4_send_check,
1394         .rebuild_header =       inet_sk_rebuild_header,
1395         .conn_request   =       tcp_v4_conn_request,
1396         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1397         .remember_stamp =       tcp_v4_remember_stamp,
1398         .net_header_len =       sizeof(struct iphdr),
1399         .setsockopt     =       ip_setsockopt,
1400         .getsockopt     =       ip_getsockopt,
1401         .addr2sockaddr  =       v4_addr2sockaddr,
1402         .sockaddr_len   =       sizeof(struct sockaddr_in),
1403 };
1404
1405 /* NOTE: A lot of things set to zero explicitly by call to
1406  *       sk_alloc() so need not be done here.
1407  */
1408 static int tcp_v4_init_sock(struct sock *sk)
1409 {
1410         struct inet_connection_sock *icsk = inet_csk(sk);
1411         struct tcp_sock *tp = tcp_sk(sk);
1412
1413         skb_queue_head_init(&tp->out_of_order_queue);
1414         tcp_init_xmit_timers(sk);
1415         tcp_prequeue_init(tp);
1416
1417         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1418         tp->mdev = TCP_TIMEOUT_INIT;
1419
1420         /* So many TCP implementations out there (incorrectly) count the
1421          * initial SYN frame in their delayed-ACK and congestion control
1422          * algorithms that we must have the following bandaid to talk
1423          * efficiently to them.  -DaveM
1424          */
1425         tp->snd_cwnd = 2;
1426
1427         /* See draft-stevens-tcpca-spec-01 for discussion of the
1428          * initialization of these values.
1429          */
1430         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1431         tp->snd_cwnd_clamp = ~0;
1432         tp->mss_cache = 536;
1433
1434         tp->reordering = sysctl_tcp_reordering;
1435         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1436
1437         sk->sk_state = TCP_CLOSE;
1438
1439         sk->sk_write_space = sk_stream_write_space;
1440         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1441
1442         tp->af_specific = &ipv4_specific;
1443
1444         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1445         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1446
1447         atomic_inc(&tcp_sockets_allocated);
1448
1449         return 0;
1450 }
1451
1452 int tcp_v4_destroy_sock(struct sock *sk)
1453 {
1454         struct tcp_sock *tp = tcp_sk(sk);
1455
1456         tcp_clear_xmit_timers(sk);
1457
1458         tcp_cleanup_congestion_control(sk);
1459
1460         /* Cleanup up the write buffer. */
1461         sk_stream_writequeue_purge(sk);
1462
1463         /* Cleans up our, hopefully empty, out_of_order_queue. */
1464         __skb_queue_purge(&tp->out_of_order_queue);
1465
1466         /* Clean prequeue, it must be empty really */
1467         __skb_queue_purge(&tp->ucopy.prequeue);
1468
1469         /* Clean up a referenced TCP bind bucket. */
1470         if (inet_csk(sk)->icsk_bind_hash)
1471                 inet_put_port(&tcp_hashinfo, sk);
1472
1473         /*
1474          * If sendmsg cached page exists, toss it.
1475          */
1476         if (sk->sk_sndmsg_page) {
1477                 __free_page(sk->sk_sndmsg_page);
1478                 sk->sk_sndmsg_page = NULL;
1479         }
1480
1481         atomic_dec(&tcp_sockets_allocated);
1482
1483         return 0;
1484 }
1485
1486 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1487
1488 #ifdef CONFIG_PROC_FS
1489 /* Proc filesystem TCP sock list dumping. */
1490
1491 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1492 {
1493         return hlist_empty(head) ? NULL :
1494                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1495 }
1496
1497 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1498 {
1499         return tw->tw_node.next ?
1500                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1501 }
1502
/*
 * Advance the /proc iterator over listening sockets and their pending
 * connection requests.
 *
 * cur is the previously returned entry (a sock, or a request_sock while
 * st->state is TCP_SEQ_STATE_OPENREQ), or NULL to start from bucket 0 of
 * the listening hash.  Only entries whose family equals st->family are
 * returned.  Returns the next entry, or NULL once every bucket has been
 * walked.
 *
 * While a request_sock is being returned, the syn_wait_lock of the
 * listener st->syn_wait_sk is left read-locked; it is dropped here when
 * that listener's request table has been exhausted.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct inet_connection_sock *icsk;
        struct hlist_node *node;
        struct sock *sk = cur;
        struct tcp_iter_state* st = seq->private;

        /* First call: start at the head of the first listening bucket. */
        if (!sk) {
                st->bucket = 0;
                sk = sk_head(&tcp_hashinfo.listening_hash[0]);
                goto get_sk;
        }

        ++st->num;

        if (st->state == TCP_SEQ_STATE_OPENREQ) {
                /* cur is a request of st->syn_wait_sk; continue in its
                 * request table (syn_wait_lock is already held).
                 */
                struct request_sock *req = cur;

                icsk = inet_csk(st->syn_wait_sk);
                req = req->dl_next;
                while (1) {
                        while (req) {
                                if (req->rsk_ops->family == st->family) {
                                        cur = req;
                                        goto out;
                                }
                                req = req->dl_next;
                        }
                        if (++st->sbucket >= TCP_SYNQ_HSIZE)
                                break;
                        /* Also entered from start_req below with
                         * st->sbucket == 0 and the lock freshly taken.
                         */
get_req:
                        req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
                }
                /* Request table exhausted: back to walking listeners. */
                sk        = sk_next(st->syn_wait_sk);
                st->state = TCP_SEQ_STATE_LISTENING;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        } else {
                /* cur is a listener that was just returned: visit its
                 * pending requests before moving on to the next socket.
                 */
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue))
                        goto start_req;
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                sk = sk_next(sk);
        }
get_sk:
        sk_for_each_from(sk, node) {
                if (sk->sk_family == st->family) {
                        cur = sk;
                        goto out;
                }
                /* Listener of another family: it is not returned itself,
                 * but its request table is still searched for requests of
                 * st->family.
                 */
                icsk = inet_csk(sk);
                read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
                if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
                        st->uid         = sock_i_uid(sk);
                        st->syn_wait_sk = sk;
                        st->state       = TCP_SEQ_STATE_OPENREQ;
                        st->sbucket     = 0;
                        goto get_req;
                }
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        }
        /* Chain finished: move to the next listening bucket, if any. */
        if (++st->bucket < INET_LHTABLE_SIZE) {
                sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
                goto get_sk;
        }
        cur = NULL;
out:
        return cur;
}
1573
1574 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1575 {
1576         void *rc = listening_get_next(seq, NULL);
1577
1578         while (rc && *pos) {
1579                 rc = listening_get_next(seq, rc);
1580                 --*pos;
1581         }
1582         return rc;
1583 }
1584
1585 static void *established_get_first(struct seq_file *seq)
1586 {
1587         struct tcp_iter_state* st = seq->private;
1588         void *rc = NULL;
1589
1590         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1591                 struct sock *sk;
1592                 struct hlist_node *node;
1593                 struct inet_timewait_sock *tw;
1594
1595                 /* We can reschedule _before_ having picked the target: */
1596                 cond_resched_softirq();
1597
1598                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1599                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1600                         if (sk->sk_family != st->family) {
1601                                 continue;
1602                         }
1603                         rc = sk;
1604                         goto out;
1605                 }
1606                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1607                 inet_twsk_for_each(tw, node,
1608                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1609                         if (tw->tw_family != st->family) {
1610                                 continue;
1611                         }
1612                         rc = tw;
1613                         goto out;
1614                 }
1615                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1616                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1617         }
1618 out:
1619         return rc;
1620 }
1621
1622 static void *established_get_next(struct seq_file *seq, void *cur)
1623 {
1624         struct sock *sk = cur;
1625         struct inet_timewait_sock *tw;
1626         struct hlist_node *node;
1627         struct tcp_iter_state* st = seq->private;
1628
1629         ++st->num;
1630
1631         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1632                 tw = cur;
1633                 tw = tw_next(tw);
1634 get_tw:
1635                 while (tw && tw->tw_family != st->family) {
1636                         tw = tw_next(tw);
1637                 }
1638                 if (tw) {
1639                         cur = tw;
1640                         goto out;
1641                 }
1642                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1643                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1644
1645                 /* We can reschedule between buckets: */
1646                 cond_resched_softirq();
1647
1648                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1649                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1650                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1651                 } else {
1652                         cur = NULL;
1653                         goto out;
1654                 }
1655         } else
1656                 sk = sk_next(sk);
1657
1658         sk_for_each_from(sk, node) {
1659                 if (sk->sk_family == st->family)
1660                         goto found;
1661         }
1662
1663         st->state = TCP_SEQ_STATE_TIME_WAIT;
1664         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1665         goto get_tw;
1666 found:
1667         cur = sk;
1668 out:
1669         return cur;
1670 }
1671
1672 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1673 {
1674         void *rc = established_get_first(seq);
1675
1676         while (rc && pos) {
1677                 rc = established_get_next(seq, rc);
1678                 --pos;
1679         }
1680         return rc;
1681 }
1682
1683 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1684 {
1685         void *rc;
1686         struct tcp_iter_state* st = seq->private;
1687
1688         inet_listen_lock(&tcp_hashinfo);
1689         st->state = TCP_SEQ_STATE_LISTENING;
1690         rc        = listening_get_idx(seq, &pos);
1691
1692         if (!rc) {
1693                 inet_listen_unlock(&tcp_hashinfo);
1694                 local_bh_disable();
1695                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1696                 rc        = established_get_idx(seq, pos);
1697         }
1698
1699         return rc;
1700 }
1701
1702 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1703 {
1704         struct tcp_iter_state* st = seq->private;
1705         st->state = TCP_SEQ_STATE_LISTENING;
1706         st->num = 0;
1707         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1708 }
1709
1710 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1711 {
1712         void *rc = NULL;
1713         struct tcp_iter_state* st;
1714
1715         if (v == SEQ_START_TOKEN) {
1716                 rc = tcp_get_idx(seq, 0);
1717                 goto out;
1718         }
1719         st = seq->private;
1720
1721         switch (st->state) {
1722         case TCP_SEQ_STATE_OPENREQ:
1723         case TCP_SEQ_STATE_LISTENING:
1724                 rc = listening_get_next(seq, v);
1725                 if (!rc) {
1726                         inet_listen_unlock(&tcp_hashinfo);
1727                         local_bh_disable();
1728                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1729                         rc        = established_get_first(seq);
1730                 }
1731                 break;
1732         case TCP_SEQ_STATE_ESTABLISHED:
1733         case TCP_SEQ_STATE_TIME_WAIT:
1734                 rc = established_get_next(seq, v);
1735                 break;
1736         }
1737 out:
1738         ++*pos;
1739         return rc;
1740 }
1741
1742 static void tcp_seq_stop(struct seq_file *seq, void *v)
1743 {
1744         struct tcp_iter_state* st = seq->private;
1745
1746         switch (st->state) {
1747         case TCP_SEQ_STATE_OPENREQ:
1748                 if (v) {
1749                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1750                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1751                 }
1752         case TCP_SEQ_STATE_LISTENING:
1753                 if (v != SEQ_START_TOKEN)
1754                         inet_listen_unlock(&tcp_hashinfo);
1755                 break;
1756         case TCP_SEQ_STATE_TIME_WAIT:
1757         case TCP_SEQ_STATE_ESTABLISHED:
1758                 if (v)
1759                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1760                 local_bh_enable();
1761                 break;
1762         }
1763 }
1764
1765 static int tcp_seq_open(struct inode *inode, struct file *file)
1766 {
1767         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1768         struct seq_file *seq;
1769         struct tcp_iter_state *s;
1770         int rc;
1771
1772         if (unlikely(afinfo == NULL))
1773                 return -EINVAL;
1774
1775         s = kmalloc(sizeof(*s), GFP_KERNEL);
1776         if (!s)
1777                 return -ENOMEM;
1778         memset(s, 0, sizeof(*s));
1779         s->family               = afinfo->family;
1780         s->seq_ops.start        = tcp_seq_start;
1781         s->seq_ops.next         = tcp_seq_next;
1782         s->seq_ops.show         = afinfo->seq_show;
1783         s->seq_ops.stop         = tcp_seq_stop;
1784
1785         rc = seq_open(file, &s->seq_ops);
1786         if (rc)
1787                 goto out_kfree;
1788         seq          = file->private_data;
1789         seq->private = s;
1790 out:
1791         return rc;
1792 out_kfree:
1793         kfree(s);
1794         goto out;
1795 }
1796
1797 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1798 {
1799         int rc = 0;
1800         struct proc_dir_entry *p;
1801
1802         if (!afinfo)
1803                 return -EINVAL;
1804         afinfo->seq_fops->owner         = afinfo->owner;
1805         afinfo->seq_fops->open          = tcp_seq_open;
1806         afinfo->seq_fops->read          = seq_read;
1807         afinfo->seq_fops->llseek        = seq_lseek;
1808         afinfo->seq_fops->release       = seq_release_private;
1809
1810         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1811         if (p)
1812                 p->data = afinfo;
1813         else
1814                 rc = -ENOMEM;
1815         return rc;
1816 }
1817
1818 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1819 {
1820         if (!afinfo)
1821                 return;
1822         proc_net_remove(afinfo->name);
1823         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
1824 }
1825
1826 static void get_openreq4(struct sock *sk, struct request_sock *req,
1827                          char *tmpbuf, int i, int uid)
1828 {
1829         const struct inet_request_sock *ireq = inet_rsk(req);
1830         int ttd = req->expires - jiffies;
1831
1832         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1833                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1834                 i,
1835                 ireq->loc_addr,
1836                 ntohs(inet_sk(sk)->sport),
1837                 ireq->rmt_addr,
1838                 ntohs(ireq->rmt_port),
1839                 TCP_SYN_RECV,
1840                 0, 0, /* could print option size, but that is af dependent. */
1841                 1,    /* timers active (only the expire timer) */
1842                 jiffies_to_clock_t(ttd),
1843                 req->retrans,
1844                 uid,
1845                 0,  /* non standard timer */
1846                 0, /* open_requests have no inode */
1847                 atomic_read(&sk->sk_refcnt),
1848                 req);
1849 }
1850
1851 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1852 {
1853         int timer_active;
1854         unsigned long timer_expires;
1855         struct tcp_sock *tp = tcp_sk(sp);
1856         const struct inet_connection_sock *icsk = inet_csk(sp);
1857         struct inet_sock *inet = inet_sk(sp);
1858         unsigned int dest = inet->daddr;
1859         unsigned int src = inet->rcv_saddr;
1860         __u16 destp = ntohs(inet->dport);
1861         __u16 srcp = ntohs(inet->sport);
1862
1863         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1864                 timer_active    = 1;
1865                 timer_expires   = icsk->icsk_timeout;
1866         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1867                 timer_active    = 4;
1868                 timer_expires   = icsk->icsk_timeout;
1869         } else if (timer_pending(&sp->sk_timer)) {
1870                 timer_active    = 2;
1871                 timer_expires   = sp->sk_timer.expires;
1872         } else {
1873                 timer_active    = 0;
1874                 timer_expires = jiffies;
1875         }
1876
1877         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1878                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1879                 i, src, srcp, dest, destp, sp->sk_state,
1880                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
1881                 timer_active,
1882                 jiffies_to_clock_t(timer_expires - jiffies),
1883                 icsk->icsk_retransmits,
1884                 sock_i_uid(sp),
1885                 icsk->icsk_probes_out,
1886                 sock_i_ino(sp),
1887                 atomic_read(&sp->sk_refcnt), sp,
1888                 icsk->icsk_rto,
1889                 icsk->icsk_ack.ato,
1890                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1891                 tp->snd_cwnd,
1892                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1893 }
1894
1895 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1896 {
1897         unsigned int dest, src;
1898         __u16 destp, srcp;
1899         int ttd = tw->tw_ttd - jiffies;
1900
1901         if (ttd < 0)
1902                 ttd = 0;
1903
1904         dest  = tw->tw_daddr;
1905         src   = tw->tw_rcv_saddr;
1906         destp = ntohs(tw->tw_dport);
1907         srcp  = ntohs(tw->tw_sport);
1908
1909         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1910                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1911                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1912                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1913                 atomic_read(&tw->tw_refcnt), tw);
1914 }
1915
1916 #define TMPSZ 150
1917
1918 static int tcp4_seq_show(struct seq_file *seq, void *v)
1919 {
1920         struct tcp_iter_state* st;
1921         char tmpbuf[TMPSZ + 1];
1922
1923         if (v == SEQ_START_TOKEN) {
1924                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1925                            "  sl  local_address rem_address   st tx_queue "
1926                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1927                            "inode");
1928                 goto out;
1929         }
1930         st = seq->private;
1931
1932         switch (st->state) {
1933         case TCP_SEQ_STATE_LISTENING:
1934         case TCP_SEQ_STATE_ESTABLISHED:
1935                 get_tcp4_sock(v, tmpbuf, st->num);
1936                 break;
1937         case TCP_SEQ_STATE_OPENREQ:
1938                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1939                 break;
1940         case TCP_SEQ_STATE_TIME_WAIT:
1941                 get_timewait4_sock(v, tmpbuf, st->num);
1942                 break;
1943         }
1944         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1945 out:
1946         return 0;
1947 }
1948
1949 static struct file_operations tcp4_seq_fops;
1950 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1951         .owner          = THIS_MODULE,
1952         .name           = "tcp",
1953         .family         = AF_INET,
1954         .seq_show       = tcp4_seq_show,
1955         .seq_fops       = &tcp4_seq_fops,
1956 };
1957
1958 int __init tcp4_proc_init(void)
1959 {
1960         return tcp_proc_register(&tcp4_seq_afinfo);
1961 }
1962
1963 void tcp4_proc_exit(void)
1964 {
1965         tcp_proc_unregister(&tcp4_seq_afinfo);
1966 }
1967 #endif /* CONFIG_PROC_FS */
1968
1969 struct proto tcp_prot = {
1970         .name                   = "TCP",
1971         .owner                  = THIS_MODULE,
1972         .close                  = tcp_close,
1973         .connect                = tcp_v4_connect,
1974         .disconnect             = tcp_disconnect,
1975         .accept                 = inet_csk_accept,
1976         .ioctl                  = tcp_ioctl,
1977         .init                   = tcp_v4_init_sock,
1978         .destroy                = tcp_v4_destroy_sock,
1979         .shutdown               = tcp_shutdown,
1980         .setsockopt             = tcp_setsockopt,
1981         .getsockopt             = tcp_getsockopt,
1982         .sendmsg                = tcp_sendmsg,
1983         .recvmsg                = tcp_recvmsg,
1984         .backlog_rcv            = tcp_v4_do_rcv,
1985         .hash                   = tcp_v4_hash,
1986         .unhash                 = tcp_unhash,
1987         .get_port               = tcp_v4_get_port,
1988         .enter_memory_pressure  = tcp_enter_memory_pressure,
1989         .sockets_allocated      = &tcp_sockets_allocated,
1990         .orphan_count           = &tcp_orphan_count,
1991         .memory_allocated       = &tcp_memory_allocated,
1992         .memory_pressure        = &tcp_memory_pressure,
1993         .sysctl_mem             = sysctl_tcp_mem,
1994         .sysctl_wmem            = sysctl_tcp_wmem,
1995         .sysctl_rmem            = sysctl_tcp_rmem,
1996         .max_header             = MAX_TCP_HEADER,
1997         .obj_size               = sizeof(struct tcp_sock),
1998         .twsk_obj_size          = sizeof(struct tcp_timewait_sock),
1999         .rsk_prot               = &tcp_request_sock_ops,
2000 };
2001
2002
2003
2004 void __init tcp_v4_init(struct net_proto_family *ops)
2005 {
2006         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2007         if (err < 0)
2008                 panic("Failed to create the TCP control socket.\n");
2009         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2010         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2011
2012         /* Unhash it so that IP input processing does not even
2013          * see it, we do not wish this socket to see incoming
2014          * packets.
2015          */
2016         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2017 }
2018
/* Core IPv4 TCP objects and entry points. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc helpers only exist when procfs support is built in. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

/* Sysctl tunables. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2038