[INET]: Introduce inet_sk_rebuild_header
[linux-2.6] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/tcp.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/xfrm.h>
71
72 #include <linux/inet.h>
73 #include <linux/ipv6.h>
74 #include <linux/stddef.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77
78 extern int sysctl_ip_dynaddr;
   /* When non-zero, __tcp_v4_check_established() may let an autobinding
    * connect() take over a matching TIME-WAIT entry whose last recorded
    * timestamp is more than one second old.
    */
79 int sysctl_tcp_tw_reuse;
   /* NOTE(review): not referenced in this part of the file. */
80 int sysctl_tcp_low_latency;
81
82 /* Check TCP sequence numbers in ICMP packets. */
83 #define ICMP_MIN_LENGTH 8
84
85 /* Socket used for sending RSTs */
86 static struct socket *tcp_socket;
87
   /* Forward declaration; the definition is not in this part of the file. */
88 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
89                        struct sk_buff *skb);
90
91 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
           /* Static initial state for the listening-hash lock, its user
            * count and wait queue, and the port allocation lock.  Members
            * not named here are zero-initialized by the C initializer rules.
            * NOTE(review): the tcp_lhash_* / tcp_portalloc_lock names used
            * by the functions in this file presumably map onto these
            * members via macros defined in a header -- confirm.
            */
92         .__tcp_lhash_lock       =       RW_LOCK_UNLOCKED,
93         .__tcp_lhash_users      =       ATOMIC_INIT(0),
94         .__tcp_lhash_wait
95           = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
96         .__tcp_portalloc_lock   =       SPIN_LOCK_UNLOCKED
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
     *
     * Element 0 is read as the low bound and element 1 as the high bound
     * by tcp_v4_get_port() and tcp_v4_hash_connect().
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
    /* Last port number examined by the automatic port search in
     * tcp_v4_get_port(); that function reads and writes it while holding
     * tcp_portalloc_lock.
     */
105 int tcp_port_rover = 1024 - 1;
106
107 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
108                                  __u32 faddr, __u16 fport)
109 {
110         int h = (laddr ^ lport) ^ (faddr ^ fport);
111         h ^= h >> 16;
112         h ^= h >> 8;
113         return h & (tcp_ehash_size - 1);
114 }
115
116 static __inline__ int tcp_sk_hashfn(struct sock *sk)
117 {
118         struct inet_sock *inet = inet_sk(sk);
119         __u32 laddr = inet->rcv_saddr;
120         __u16 lport = inet->num;
121         __u32 faddr = inet->daddr;
122         __u16 fport = inet->dport;
123
124         return tcp_hashfn(laddr, lport, faddr, fport);
125 }
126
127 /* Allocate and initialize a new TCP local port bind bucket.
128  * The bindhash mutex for snum's hash chain must be held here.
129  */
130 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
131                                           unsigned short snum)
132 {
133         struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
134                                                       SLAB_ATOMIC);
135         if (tb) {
136                 tb->port = snum;
137                 tb->fastreuse = 0;
138                 INIT_HLIST_HEAD(&tb->owners);
139                 hlist_add_head(&tb->node, &head->chain);
140         }
141         return tb;
142 }
143
144 /* Caller must hold hashbucket lock for this tb with local BH disabled */
145 void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
146 {
147         if (hlist_empty(&tb->owners)) {
148                 __hlist_del(&tb->node);
149                 kmem_cache_free(tcp_bucket_cachep, tb);
150         }
151 }
152
153 /* Caller must disable local BH processing. */
154 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
155 {
156         struct tcp_bind_hashbucket *head =
157                                 &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
158         struct tcp_bind_bucket *tb;
159
160         spin_lock(&head->lock);
161         tb = tcp_sk(sk)->bind_hash;
162         sk_add_bind_node(child, &tb->owners);
163         tcp_sk(child)->bind_hash = tb;
164         spin_unlock(&head->lock);
165 }
166
/* BH-safe wrapper around __tcp_inherit_port(): disables local bottom
 * halves, lets child share sk's bind bucket, then re-enables them.
 */
inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}
173
174 void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
175                    unsigned short snum)
176 {
177         inet_sk(sk)->num = snum;
178         sk_add_bind_node(sk, &tb->owners);
179         tcp_sk(sk)->bind_hash = tb;
180 }
181
182 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
183 {
184         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
185         struct sock *sk2;
186         struct hlist_node *node;
187         int reuse = sk->sk_reuse;
188
189         sk_for_each_bound(sk2, node, &tb->owners) {
190                 if (sk != sk2 &&
191                     !tcp_v6_ipv6only(sk2) &&
192                     (!sk->sk_bound_dev_if ||
193                      !sk2->sk_bound_dev_if ||
194                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
195                         if (!reuse || !sk2->sk_reuse ||
196                             sk2->sk_state == TCP_LISTEN) {
197                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
198                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
199                                     sk2_rcv_saddr == sk_rcv_saddr)
200                                         break;
201                         }
202                 }
203         }
204         return node != NULL;
205 }
206
207 /* Obtain a reference to a local port for the given sock,
208  * if snum is zero it means select any available local port.
     *
     * Returns 0 on success, with sk attached to the port's bind bucket
     * (unless it already had a bind_hash), and 1 on failure: no unused
     * port found in sysctl_local_port_range, a conflicting owner reported
     * by tcp_bind_conflict(), or failure to allocate a new bucket.
     *
     * Local BHs are disabled for the whole call; the bind hash chain lock
     * of the chosen port is held from the lookup until the exit path.
209  */
210 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
211 {
212         struct tcp_bind_hashbucket *head;
213         struct hlist_node *node;
214         struct tcp_bind_bucket *tb;
215         int ret;
216
217         local_bh_disable();
218         if (!snum) {
                    /* Automatic selection: look at up to (high - low) + 1
                     * ports, continuing from tcp_port_rover, for one that
                     * has no bind bucket at all.
                     */
219                 int low = sysctl_local_port_range[0];
220                 int high = sysctl_local_port_range[1];
221                 int remaining = (high - low) + 1;
222                 int rover;
223
224                 spin_lock(&tcp_portalloc_lock);
225                 if (tcp_port_rover < low)
226                         rover = low;
227                 else
228                         rover = tcp_port_rover;
229                 do {
230                         rover++;
231                         if (rover > high)
232                                 rover = low;
233                         head = &tcp_bhash[tcp_bhashfn(rover)];
234                         spin_lock(&head->lock);
235                         tb_for_each(tb, node, &head->chain)
236                                 if (tb->port == rover)
237                                         goto next;
                            /* No bucket exists for rover: leave the loop
                             * with head->lock still held.
                             */
238                         break;
239                 next:
240                         spin_unlock(&head->lock);
241                 } while (--remaining > 0);
242                 tcp_port_rover = rover;
243                 spin_unlock(&tcp_portalloc_lock);
244
245                 /* Exhausted local port range during search?  It is not
246                  * possible for us to be holding one of the bind hash
247                  * locks if this test triggers, because if 'remaining'
248                  * drops to zero, we broke out of the do/while loop at
249                  * the top level, not from the 'break;' statement.
250                  */
251                 ret = 1;
252                 if (unlikely(remaining <= 0))
253                         goto fail;
254
255                 /* OK, here is the one we will use.  HEAD is
256                  * non-NULL and we hold its mutex.
257                  */
258                 snum = rover;
259         } else {
                    /* Explicit port: take its chain lock and look for an
                     * existing bucket.
                     */
260                 head = &tcp_bhash[tcp_bhashfn(snum)];
261                 spin_lock(&head->lock);
262                 tb_for_each(tb, node, &head->chain)
263                         if (tb->port == snum)
264                                 goto tb_found;
265         }
266         tb = NULL;
267         goto tb_not_found;
268 tb_found:
269         if (!hlist_empty(&tb->owners)) {
                    /* NOTE(review): sk_reuse values above 1 skip conflict
                     * checking entirely; what sets such a value is not
                     * visible in this part of the file.
                     */
270                 if (sk->sk_reuse > 1)
271                         goto success;
                    /* Fast path: every current owner allowed reuse
                     * (fastreuse > 0) and so does this non-listening
                     * socket, so the owner list need not be walked.
                     */
272                 if (tb->fastreuse > 0 &&
273                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
274                         goto success;
275                 } else {
276                         ret = 1;
277                         if (tcp_bind_conflict(sk, tb))
278                                 goto fail_unlock;
279                 }
280         }
281 tb_not_found:
282         ret = 1;
283         if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
284                 goto fail_unlock;
            /* Maintain fastreuse: an empty bucket takes its value from
             * this socket; a non-empty one loses it as soon as a socket
             * without sk_reuse, or a listening socket, joins.
             */
285         if (hlist_empty(&tb->owners)) {
286                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
287                         tb->fastreuse = 1;
288                 else
289                         tb->fastreuse = 0;
290         } else if (tb->fastreuse &&
291                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
292                 tb->fastreuse = 0;
293 success:
            /* Attach only if the socket is not already bound to a bucket. */
294         if (!tcp_sk(sk)->bind_hash)
295                 tcp_bind_hash(sk, tb, snum);
296         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
297         ret = 0;
298
299 fail_unlock:
300         spin_unlock(&head->lock);
301 fail:
302         local_bh_enable();
303         return ret;
304 }
305
306 /* Get rid of any references to a local port held by the
307  * given sock.
308  */
309 static void __tcp_put_port(struct sock *sk)
310 {
311         struct inet_sock *inet = inet_sk(sk);
312         struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
313         struct tcp_bind_bucket *tb;
314
315         spin_lock(&head->lock);
316         tb = tcp_sk(sk)->bind_hash;
317         __sk_del_bind_node(sk);
318         tcp_sk(sk)->bind_hash = NULL;
319         inet->num = 0;
320         tcp_bucket_destroy(tb);
321         spin_unlock(&head->lock);
322 }
323
/* Release sk's local port with bottom halves disabled around the
 * actual work done by __tcp_put_port().
 */
void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}
330
331 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
332  * Look, when several writers sleep and reader wakes them up, all but one
333  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
334  * this, _but_ remember, it adds useless work on UP machines (wake up each
335  * exclusive lock release). It should be ifdefed really.
336  */
337
338 void tcp_listen_wlock(void)
339 {
340         write_lock(&tcp_lhash_lock);
341
342         if (atomic_read(&tcp_lhash_users)) {
343                 DEFINE_WAIT(wait);
344
345                 for (;;) {
346                         prepare_to_wait_exclusive(&tcp_lhash_wait,
347                                                 &wait, TASK_UNINTERRUPTIBLE);
348                         if (!atomic_read(&tcp_lhash_users))
349                                 break;
350                         write_unlock_bh(&tcp_lhash_lock);
351                         schedule();
352                         write_lock_bh(&tcp_lhash_lock);
353                 }
354
355                 finish_wait(&tcp_lhash_wait, &wait);
356         }
357 }
358
359 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
360 {
361         struct hlist_head *list;
362         rwlock_t *lock;
363
364         BUG_TRAP(sk_unhashed(sk));
365         if (listen_possible && sk->sk_state == TCP_LISTEN) {
366                 list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
367                 lock = &tcp_lhash_lock;
368                 tcp_listen_wlock();
369         } else {
370                 list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
371                 lock = &tcp_ehash[sk->sk_hashent].lock;
372                 write_lock(lock);
373         }
374         __sk_add_node(sk, list);
375         sock_prot_inc_use(sk->sk_prot);
376         write_unlock(lock);
377         if (listen_possible && sk->sk_state == TCP_LISTEN)
378                 wake_up(&tcp_lhash_wait);
379 }
380
381 static void tcp_v4_hash(struct sock *sk)
382 {
383         if (sk->sk_state != TCP_CLOSE) {
384                 local_bh_disable();
385                 __tcp_v4_hash(sk, 1);
386                 local_bh_enable();
387         }
388 }
389
390 void tcp_unhash(struct sock *sk)
391 {
392         rwlock_t *lock;
393
394         if (sk_unhashed(sk))
395                 goto ende;
396
397         if (sk->sk_state == TCP_LISTEN) {
398                 local_bh_disable();
399                 tcp_listen_wlock();
400                 lock = &tcp_lhash_lock;
401         } else {
402                 struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
403                 lock = &head->lock;
404                 write_lock_bh(&head->lock);
405         }
406
407         if (__sk_del_node_init(sk))
408                 sock_prot_dec_use(sk->sk_prot);
409         write_unlock_bh(lock);
410
411  ende:
412         if (sk->sk_state == TCP_LISTEN)
413                 wake_up(&tcp_lhash_wait);
414 }
415
416 /* Don't inline this cruft.  Here are some nice properties to
417  * exploit here.  The BSD API does not allow a listening TCP
418  * to specify the remote port nor the remote address for the
419  * connection.  So always assume those are both wildcarded
420  * during the search since they can never be otherwise.
421  */
422 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
423                                              unsigned short hnum, int dif)
424 {
425         struct sock *result = NULL, *sk;
426         struct hlist_node *node;
427         int score, hiscore;
428
429         hiscore=-1;
430         sk_for_each(sk, node, head) {
431                 struct inet_sock *inet = inet_sk(sk);
432
433                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
434                         __u32 rcv_saddr = inet->rcv_saddr;
435
436                         score = (sk->sk_family == PF_INET ? 1 : 0);
437                         if (rcv_saddr) {
438                                 if (rcv_saddr != daddr)
439                                         continue;
440                                 score+=2;
441                         }
442                         if (sk->sk_bound_dev_if) {
443                                 if (sk->sk_bound_dev_if != dif)
444                                         continue;
445                                 score+=2;
446                         }
447                         if (score == 5)
448                                 return sk;
449                         if (score > hiscore) {
450                                 hiscore = score;
451                                 result = sk;
452                         }
453                 }
454         }
455         return result;
456 }
457
458 /* Optimize the common listener case. */
459 static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
460                 unsigned short hnum, int dif)
461 {
462         struct sock *sk = NULL;
463         struct hlist_head *head;
464
465         read_lock(&tcp_lhash_lock);
466         head = &tcp_listening_hash[tcp_lhashfn(hnum)];
467         if (!hlist_empty(head)) {
468                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
469
470                 if (inet->num == hnum && !sk->sk_node.next &&
471                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
472                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
473                     !sk->sk_bound_dev_if)
474                         goto sherry_cache;
475                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
476         }
477         if (sk) {
478 sherry_cache:
479                 sock_hold(sk);
480         }
481         read_unlock(&tcp_lhash_lock);
482         return sk;
483 }
484
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487  *
488  * Local BH must be disabled here.
489  */
490
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492                                                        u32 daddr, u16 hnum,
493                                                        int dif)
494 {
495         struct tcp_ehash_bucket *head;
496         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
497         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
498         struct sock *sk;
499         struct hlist_node *node;
500         /* Optimize here for direct hit, only listening connections can
501          * have wildcards anyways.
502          */
503         int hash = tcp_hashfn(daddr, hnum, saddr, sport);
504         head = &tcp_ehash[hash];
505         read_lock(&head->lock);
506         sk_for_each(sk, node, &head->chain) {
507                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508                         goto hit; /* You sunk my battleship! */
509         }
510
511         /* Must check for a TIME_WAIT'er before going to listener hash. */
512         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
513                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
514                         goto hit;
515         }
516         sk = NULL;
517 out:
518         read_unlock(&head->lock);
519         return sk;
520 hit:
521         sock_hold(sk);
522         goto out;
523 }
524
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526                                            u32 daddr, u16 hnum, int dif)
527 {
528         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
529                                                       daddr, hnum, dif);
530
531         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
532 }
533
534 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
535                                   u16 dport, int dif)
536 {
537         struct sock *sk;
538
539         local_bh_disable();
540         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
541         local_bh_enable();
542
543         return sk;
544 }
545
546 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
547
548 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
549 {
550         return secure_tcp_sequence_number(skb->nh.iph->daddr,
551                                           skb->nh.iph->saddr,
552                                           skb->h.th->dest,
553                                           skb->h.th->source);
554 }
555
556 /* called with local bh disabled
     *
     * Check that the connection (inet->rcv_saddr, lport) ->
     * (inet->daddr, inet->dport) does not already exist in the established
     * hash, and if it is unique, hash sk there under the bucket write lock.
     *
     * twp selects what happens to a matching TIME-WAIT entry that is
     * judged reusable:
     *   - twp != NULL: the entry (or NULL if none matched) is stored in
     *     *twp with a reference held, for the caller to deschedule and put;
     *   - twp == NULL: the entry is descheduled and released here.
     *
     * Returns 0 after hashing sk, or -EADDRNOTAVAIL if the tuple is taken.
557  */
558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
559                                       struct tcp_tw_bucket **twp)
560 {
561         struct inet_sock *inet = inet_sk(sk);
            /* The lookup macros are written from the point of view of an
             * incoming packet, so "daddr" is our local address and "saddr"
             * is the peer's.
             */
562         u32 daddr = inet->rcv_saddr;
563         u32 saddr = inet->daddr;
564         int dif = sk->sk_bound_dev_if;
565         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
566         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
567         int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
568         struct tcp_ehash_bucket *head = &tcp_ehash[hash];
569         struct sock *sk2;
570         struct hlist_node *node;
571         struct tcp_tw_bucket *tw;
572
573         write_lock(&head->lock);
574
575         /* Check TIME-WAIT sockets first. */
576         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
577                 tw = (struct tcp_tw_bucket *)sk2;
578
579                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
580                         struct tcp_sock *tp = tcp_sk(sk);
581
582                         /* With PAWS, it is safe from the viewpoint
583                            of data integrity. Even without PAWS it
584                            is safe provided sequence spaces do not
585                            overlap i.e. at data rates <= 80Mbit/sec.
586
587                            Actually, the idea is close to VJ's one,
588                            only timestamp cache is held not per host,
589                            but per port pair and TW bucket is used
590                            as state holder.
591
592                            If TW bucket has been already destroyed we
593                            fall back to VJ's scheme and use initial
594                            timestamp retrieved from peer table.
595                          */
                            /* Reuse requires a recorded timestamp.  With
                             * twp == NULL that is sufficient; otherwise
                             * sysctl_tcp_tw_reuse must be set and the
                             * timestamp must be more than one second old.
                             */
596                         if (tw->tw_ts_recent_stamp &&
597                             (!twp || (sysctl_tcp_tw_reuse &&
598                                       xtime.tv_sec -
599                                       tw->tw_ts_recent_stamp > 1))) {
                                    /* Start beyond the old connection's
                                     * sequence space; 0 is avoided because
                                     * tcp_v4_connect() treats a zero
                                     * write_seq as "not chosen yet".
                                     */
600                                 if ((tp->write_seq =
601                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
602                                         tp->write_seq = 1;
603                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
604                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
                                    /* Reference dropped later via
                                     * tcp_tw_put(), here or by the caller.
                                     */
605                                 sock_hold(sk2);
606                                 goto unique;
607                         } else
608                                 goto not_unique;
609                 }
610         }
611         tw = NULL;
612
613         /* And established part... */
614         sk_for_each(sk2, node, &head->chain) {
615                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
616                         goto not_unique;
617         }
618
619 unique:
620         /* Must record num and sport now. Otherwise we will see
621          * in hash table socket with a funny identity. */
622         inet->num = lport;
623         inet->sport = htons(lport);
624         sk->sk_hashent = hash;
625         BUG_TRAP(sk_unhashed(sk));
626         __sk_add_node(sk, &head->chain);
627         sock_prot_inc_use(sk->sk_prot);
628         write_unlock(&head->lock);
629
            /* NOTE(review): when twp is non-NULL the TIMEWAITRECYCLED
             * counter is incremented even if no TIME-WAIT entry was found
             * (tw == NULL) -- confirm this is intended.
             */
630         if (twp) {
631                 *twp = tw;
632                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
633         } else if (tw) {
634                 /* Silly. Should hash-dance instead... */
635                 tcp_tw_deschedule(tw);
636                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
637
638                 tcp_tw_put(tw);
639         }
640
641         return 0;
642
643 not_unique:
644         write_unlock(&head->lock);
645         return -EADDRNOTAVAIL;
646 }
646
647 static inline u32 connect_port_offset(const struct sock *sk)
648 {
649         const struct inet_sock *inet = inet_sk(sk);
650
651         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
652                                          inet->dport);
653 }
654
655 /*
656  * Bind a port for a connect operation and hash it.
     *
     * If the socket has no local port yet (inet->num == 0), candidate
     * ports from sysctl_local_port_range are tried, starting at an offset
     * derived from connect_port_offset() plus a static hint.  Otherwise
     * the already bound port is checked for uniqueness of the 4-tuple.
     *
     * Returns 0 once the socket is in the established hash, or
     * -EADDRNOTAVAIL if no usable port or unique tuple was found.
657  */
658 static inline int tcp_v4_hash_connect(struct sock *sk)
659 {
660         unsigned short snum = inet_sk(sk)->num;
661         struct tcp_bind_hashbucket *head;
662         struct tcp_bind_bucket *tb;
663         int ret;
664
665         if (!snum) {
666                 int low = sysctl_local_port_range[0];
667                 int high = sysctl_local_port_range[1];
                    /* NOTE(review): with range = high - low the candidates
                     * below are low .. high - 1, so 'high' itself is never
                     * tried, and a zero range skips the loop entirely.
                     */
668                 int range = high - low;
669                 int i;
670                 int port;
671                 static u32 hint;
672                 u32 offset = hint + connect_port_offset(sk);
673                 struct hlist_node *node;
674                 struct tcp_tw_bucket *tw = NULL;
675
676                 local_bh_disable();
677                 for (i = 1; i <= range; i++) {
678                         port = low + (i + offset) % range;
679                         head = &tcp_bhash[tcp_bhashfn(port)];
680                         spin_lock(&head->lock);
681
682                         /* Does not bother with rcv_saddr checks,
683                          * because the established check is already
684                          * unique enough.
685                          */
686                         tb_for_each(tb, node, &head->chain) {
687                                 if (tb->port == port) {
688                                         BUG_TRAP(!hlist_empty(&tb->owners));
                                            /* Buckets created by this
                                             * function carry fastreuse
                                             * -1; tcp_v4_get_port() sets
                                             * 0 or 1.  Ports with a
                                             * non-negative value are
                                             * skipped.
                                             */
689                                         if (tb->fastreuse >= 0)
690                                                 goto next_port;
                                            /* Port is shared with other
                                             * connect()ed sockets: usable
                                             * if our 4-tuple is unique.
                                             * On success sk is already
                                             * hashed by the callee.
                                             */
691                                         if (!__tcp_v4_check_established(sk,
692                                                                         port,
693                                                                         &tw))
694                                                 goto ok;
695                                         goto next_port;
696                                 }
697                         }
698
                            /* No bucket for this port yet: create one. */
699                         tb = tcp_bucket_create(head, port);
700                         if (!tb) {
701                                 spin_unlock(&head->lock);
702                                 break;
703                         }
704                         tb->fastreuse = -1;
705                         goto ok;
706
707                 next_port:
708                         spin_unlock(&head->lock);
709                 }
710                 local_bh_enable();
711
712                 return -EADDRNOTAVAIL;
713
714 ok:
                    /* Advance the shared hint by the number of ports
                     * examined in this call.
                     */
715                 hint += i;
716
717                 /* Head lock still held and bh's disabled */
718                 tcp_bind_hash(sk, tb, port);
                    /* Still unhashed only when a fresh bucket was created
                     * above; the check_established path hashed sk already.
                     */
719                 if (sk_unhashed(sk)) {
720                         inet_sk(sk)->sport = htons(port);
721                         __tcp_v4_hash(sk, 0);
722                 }
723                 spin_unlock(&head->lock);
724
                    /* Retire the TIME-WAIT entry handed back through &tw,
                     * if any, and drop the reference taken on it.
                     */
725                 if (tw) {
726                         tcp_tw_deschedule(tw);
727                         tcp_tw_put(tw);
728                 }
729
730                 ret = 0;
731                 goto out;
732         }
733
            /* Port already bound. */
734         head  = &tcp_bhash[tcp_bhashfn(snum)];
735         tb  = tcp_sk(sk)->bind_hash;
736         spin_lock_bh(&head->lock);
            /* If sk is the only owner of its bucket, no other socket can
             * hold the same tuple and sk can be hashed directly.
             */
737         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
738                 __tcp_v4_hash(sk, 0);
739                 spin_unlock_bh(&head->lock);
740                 return 0;
741         } else {
                    /* Plain spin_unlock(): BHs stay disabled across the
                     * established check and are re-enabled at 'out'.
                     */
742                 spin_unlock(&head->lock);
743                 /* No definite answer... Walk to established hash table */
744                 ret = __tcp_v4_check_established(sk, snum, NULL);
745 out:
                    /* Also reached by the goto from the autobind path
                     * above, which left BHs disabled.
                     */
746                 local_bh_enable();
747                 return ret;
748         }
749 }
750
751 /* This will initiate an outgoing connection.
     *
     * sk:       socket to connect
     * uaddr:    destination, interpreted as a struct sockaddr_in
     * addr_len: length of uaddr
     *
     * Validates the address, routes to the destination, fills in the
     * source address if unset, moves the socket to SYN-SENT, selects and
     * hashes the local port with tcp_v4_hash_connect(), chooses the
     * initial write_seq if none is set and hands off to tcp_connect().
     *
     * Returns 0 on success.  Early validation failures return -EINVAL,
     * -EAFNOSUPPORT or -ENETUNREACH, or the negative value from
     * ip_route_connect().  Later failures return the helper's error after
     * the socket has been put back into TCP_CLOSE.
     */
752 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
753 {
754         struct inet_sock *inet = inet_sk(sk);
755         struct tcp_sock *tp = tcp_sk(sk);
756         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
757         struct rtable *rt;
758         u32 daddr, nexthop;
759         int tmp;
760         int err;
761
            /* NOTE(review): addr_len is an int compared against a sizeof
             * expression, so a negative value would not be rejected here;
             * presumably callers never pass one -- confirm.
             */
762         if (addr_len < sizeof(struct sockaddr_in))
763                 return -EINVAL;
764
765         if (usin->sin_family != AF_INET)
766                 return -EAFNOSUPPORT;
767
            /* With a source route option set, route towards the option's
             * first hop rather than the final destination.
             */
768         nexthop = daddr = usin->sin_addr.s_addr;
769         if (inet->opt && inet->opt->srr) {
770                 if (!daddr)
771                         return -EINVAL;
772                 nexthop = inet->opt->faddr;
773         }
774
775         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
776                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
777                                IPPROTO_TCP,
778                                inet->sport, usin->sin_port, sk);
779         if (tmp < 0)
780                 return tmp;
781
            /* Routes flagged multicast or broadcast are refused. */
782         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
783                 ip_rt_put(rt);
784                 return -ENETUNREACH;
785         }
786
            /* Without source routing, use the route's destination. */
787         if (!inet->opt || !inet->opt->srr)
788                 daddr = rt->rt_dst;
789
790         if (!inet->saddr)
791                 inet->saddr = rt->rt_src;
792         inet->rcv_saddr = inet->saddr;
793
794         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
795                 /* Reset inherited state */
796                 tp->rx_opt.ts_recent       = 0;
797                 tp->rx_opt.ts_recent_stamp = 0;
798                 tp->write_seq              = 0;
799         }
800
801         if (sysctl_tcp_tw_recycle &&
802             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
803                 struct inet_peer *peer = rt_get_peer(rt);
804
805                 /* VJ's idea. We save last timestamp seen from
806                  * the destination in peer table, when entering state TIME-WAIT
807                  * and initialize rx_opt.ts_recent from it, when trying new connection.
808                  */
809
                    /* Only use a peer timestamp no older than TCP_PAWS_MSL. */
810                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
811                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
812                         tp->rx_opt.ts_recent = peer->tcp_ts;
813                 }
814         }
815
816         inet->dport = usin->sin_port;
817         inet->daddr = daddr;
818
            /* Account for the length of any IP options on this socket. */
819         tp->ext_header_len = 0;
820         if (inet->opt)
821                 tp->ext_header_len = inet->opt->optlen;
822
823         tp->rx_opt.mss_clamp = 536;
824
825         /* Socket identity is still unknown (sport may be zero).
826          * However we set state to SYN-SENT and not releasing socket
827          * lock select source port, enter ourselves into the hash tables and
828          * complete initialization after this.
829          */
830         tcp_set_state(sk, TCP_SYN_SENT);
831         err = tcp_v4_hash_connect(sk);
832         if (err)
833                 goto failure;
834
            /* The source port may only have been chosen just now, so the
             * route is refreshed with the final port pair.
             */
835         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
836         if (err)
837                 goto failure;
838
839         /* OK, now commit destination to socket.  */
840         sk_setup_caps(sk, &rt->u.dst);
841
            /* Keep a write_seq inherited from a recycled TIME-WAIT entry
             * (see __tcp_v4_check_established()); otherwise generate one.
             */
842         if (!tp->write_seq)
843                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
844                                                            inet->daddr,
845                                                            inet->sport,
846                                                            usin->sin_port);
847
848         inet->id = tp->write_seq ^ jiffies;
849
850         err = tcp_connect(sk);
            /* NOTE(review): rt is cleared so the failure path below calls
             * ip_rt_put(NULL); presumably the route now belongs to the
             * socket after sk_setup_caps() and ip_rt_put() tolerates
             * NULL -- confirm.
             */
851         rt = NULL;
852         if (err)
853                 goto failure;
854
855         return 0;
856
857 failure:
858         /* This unhashes the socket and releases the local port, if necessary. */
859         tcp_set_state(sk, TCP_CLOSE);
860         ip_rt_put(rt);
861         sk->sk_route_caps = 0;
862         inet->dport = 0;
863         return err;
864 }
865
866 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
867 {
868         return ((struct rtable *)skb->dst)->rt_iif;
869 }
870
871 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
872 {
873         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
874 }
875
876 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
877                                               struct request_sock ***prevp,
878                                               __u16 rport,
879                                               __u32 raddr, __u32 laddr)
880 {
881         struct listen_sock *lopt = tp->accept_queue.listen_opt;
882         struct request_sock *req, **prev;
883
884         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
885              (req = *prev) != NULL;
886              prev = &req->dl_next) {
887                 const struct inet_request_sock *ireq = inet_rsk(req);
888
889                 if (ireq->rmt_port == rport &&
890                     ireq->rmt_addr == raddr &&
891                     ireq->loc_addr == laddr &&
892                     TCP_INET_FAMILY(req->rsk_ops->family)) {
893                         BUG_TRAP(!req->sk);
894                         *prevp = prev;
895                         break;
896                 }
897         }
898
899         return req;
900 }
901
902 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
903 {
904         struct tcp_sock *tp = tcp_sk(sk);
905         struct listen_sock *lopt = tp->accept_queue.listen_opt;
906         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
907
908         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
909         tcp_synq_added(sk);
910 }
911
912
913 /*
914  * This routine does path mtu discovery as defined in RFC1191.
915  */
916 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
917                                      u32 mtu)
918 {
919         struct dst_entry *dst;
920         struct inet_sock *inet = inet_sk(sk);
921         struct tcp_sock *tp = tcp_sk(sk);
922
923         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
924          * send out by Linux are always <576bytes so they should go through
925          * unfragmented).
926          */
927         if (sk->sk_state == TCP_LISTEN)
928                 return;
929
930         /* We don't check in the destentry if pmtu discovery is forbidden
931          * on this route. We just assume that no packet_to_big packets
932          * are send back when pmtu discovery is not active.
933          * There is a small race when the user changes this flag in the
934          * route, but I think that's acceptable.
935          */
936         if ((dst = __sk_dst_check(sk, 0)) == NULL)
937                 return;
938
939         dst->ops->update_pmtu(dst, mtu);
940
941         /* Something is about to be wrong... Remember soft error
942          * for the case, if this connection will not able to recover.
943          */
944         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
945                 sk->sk_err_soft = EMSGSIZE;
946
947         mtu = dst_mtu(dst);
948
949         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
950             tp->pmtu_cookie > mtu) {
951                 tcp_sync_mss(sk, mtu);
952
953                 /* Resend the TCP packet because it's
954                  * clear that the old packet has been
955                  * dropped. This is the new "fast" path mtu
956                  * discovery.
957                  */
958                 tcp_simple_retransmit(sk);
959         } /* else let the usual retransmit timer handle it */
960 }
961
962 /*
963  * This routine is called by the ICMP module when it gets some
964  * sort of error condition.  If err < 0 then the socket should
965  * be closed and the error returned to the user.  If err > 0
966  * it's just the icmp type << 8 | icmp code.  After adjustment
967  * header points to the first 8 bytes of the tcp header.  We need
968  * to find the appropriate port.
969  *
970  * The locking strategy used here is very "optimistic". When
971  * someone else accesses the socket the ICMP is just dropped
972  * and for some paths there is no check at all.
973  * A more general error queue to queue errors for later handling
974  * is probably better.
975  *
976  */
977
978 void tcp_v4_err(struct sk_buff *skb, u32 info)
979 {
980         struct iphdr *iph = (struct iphdr *)skb->data;
981         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
982         struct tcp_sock *tp;
983         struct inet_sock *inet;
984         int type = skb->h.icmph->type;
985         int code = skb->h.icmph->code;
986         struct sock *sk;
987         __u32 seq;
988         int err;
989
990         if (skb->len < (iph->ihl << 2) + 8) {
991                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
992                 return;
993         }
994
995         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
996                            th->source, tcp_v4_iif(skb));
997         if (!sk) {
998                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
999                 return;
1000         }
1001         if (sk->sk_state == TCP_TIME_WAIT) {
1002                 tcp_tw_put((struct tcp_tw_bucket *)sk);
1003                 return;
1004         }
1005
1006         bh_lock_sock(sk);
1007         /* If too many ICMPs get dropped on busy
1008          * servers this needs to be solved differently.
1009          */
1010         if (sock_owned_by_user(sk))
1011                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
1012
1013         if (sk->sk_state == TCP_CLOSE)
1014                 goto out;
1015
1016         tp = tcp_sk(sk);
1017         seq = ntohl(th->seq);
1018         if (sk->sk_state != TCP_LISTEN &&
1019             !between(seq, tp->snd_una, tp->snd_nxt)) {
1020                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
1021                 goto out;
1022         }
1023
1024         switch (type) {
1025         case ICMP_SOURCE_QUENCH:
1026                 /* Just silently ignore these. */
1027                 goto out;
1028         case ICMP_PARAMETERPROB:
1029                 err = EPROTO;
1030                 break;
1031         case ICMP_DEST_UNREACH:
1032                 if (code > NR_ICMP_UNREACH)
1033                         goto out;
1034
1035                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1036                         if (!sock_owned_by_user(sk))
1037                                 do_pmtu_discovery(sk, iph, info);
1038                         goto out;
1039                 }
1040
1041                 err = icmp_err_convert[code].errno;
1042                 break;
1043         case ICMP_TIME_EXCEEDED:
1044                 err = EHOSTUNREACH;
1045                 break;
1046         default:
1047                 goto out;
1048         }
1049
1050         switch (sk->sk_state) {
1051                 struct request_sock *req, **prev;
1052         case TCP_LISTEN:
1053                 if (sock_owned_by_user(sk))
1054                         goto out;
1055
1056                 req = tcp_v4_search_req(tp, &prev, th->dest,
1057                                         iph->daddr, iph->saddr);
1058                 if (!req)
1059                         goto out;
1060
1061                 /* ICMPs are not backlogged, hence we cannot get
1062                    an established socket here.
1063                  */
1064                 BUG_TRAP(!req->sk);
1065
1066                 if (seq != tcp_rsk(req)->snt_isn) {
1067                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1068                         goto out;
1069                 }
1070
1071                 /*
1072                  * Still in SYN_RECV, just remove it silently.
1073                  * There is no good way to pass the error to the newly
1074                  * created socket, and POSIX does not want network
1075                  * errors returned from accept().
1076                  */
1077                 tcp_synq_drop(sk, req, prev);
1078                 goto out;
1079
1080         case TCP_SYN_SENT:
1081         case TCP_SYN_RECV:  /* Cannot happen.
1082                                It can f.e. if SYNs crossed.
1083                              */
1084                 if (!sock_owned_by_user(sk)) {
1085                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1086                         sk->sk_err = err;
1087
1088                         sk->sk_error_report(sk);
1089
1090                         tcp_done(sk);
1091                 } else {
1092                         sk->sk_err_soft = err;
1093                 }
1094                 goto out;
1095         }
1096
1097         /* If we've already connected we will keep trying
1098          * until we time out, or the user gives up.
1099          *
1100          * rfc1122 4.2.3.9 allows to consider as hard errors
1101          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1102          * but it is obsoleted by pmtu discovery).
1103          *
1104          * Note, that in modern internet, where routing is unreliable
1105          * and in each dark corner broken firewalls sit, sending random
1106          * errors ordered by their masters even this two messages finally lose
1107          * their original sense (even Linux sends invalid PORT_UNREACHs)
1108          *
1109          * Now we are in compliance with RFCs.
1110          *                                                      --ANK (980905)
1111          */
1112
1113         inet = inet_sk(sk);
1114         if (!sock_owned_by_user(sk) && inet->recverr) {
1115                 sk->sk_err = err;
1116                 sk->sk_error_report(sk);
1117         } else  { /* Only an error on timeout */
1118                 sk->sk_err_soft = err;
1119         }
1120
1121 out:
1122         bh_unlock_sock(sk);
1123         sock_put(sk);
1124 }
1125
1126 /* This routine computes an IPv4 TCP checksum. */
1127 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1128                        struct sk_buff *skb)
1129 {
1130         struct inet_sock *inet = inet_sk(sk);
1131
1132         if (skb->ip_summed == CHECKSUM_HW) {
1133                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1134                 skb->csum = offsetof(struct tcphdr, check);
1135         } else {
1136                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1137                                          csum_partial((char *)th,
1138                                                       th->doff << 2,
1139                                                       skb->csum));
1140         }
1141 }
1142
1143 /*
1144  *      This routine will send an RST to the other tcp.
1145  *
1146  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1147  *                    for reset.
1148  *      Answer: if a packet caused RST, it is not for a socket
1149  *              existing in our system, if it is matched to a socket,
1150  *              it is just duplicate segment or bug in other side's TCP.
1151  *              So that we build reply only basing on parameters
1152  *              arrived with segment.
1153  *      Exception: precedence violation. We do not implement it in any case.
1154  */
1155
1156 static void tcp_v4_send_reset(struct sk_buff *skb)
1157 {
1158         struct tcphdr *th = skb->h.th;
1159         struct tcphdr rth;
1160         struct ip_reply_arg arg;
1161
1162         /* Never send a reset in response to a reset. */
1163         if (th->rst)
1164                 return;
1165
1166         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1167                 return;
1168
1169         /* Swap the send and the receive. */
1170         memset(&rth, 0, sizeof(struct tcphdr));
1171         rth.dest   = th->source;
1172         rth.source = th->dest;
1173         rth.doff   = sizeof(struct tcphdr) / 4;
1174         rth.rst    = 1;
1175
1176         if (th->ack) {
1177                 rth.seq = th->ack_seq;
1178         } else {
1179                 rth.ack = 1;
1180                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1181                                     skb->len - (th->doff << 2));
1182         }
1183
1184         memset(&arg, 0, sizeof arg);
1185         arg.iov[0].iov_base = (unsigned char *)&rth;
1186         arg.iov[0].iov_len  = sizeof rth;
1187         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1188                                       skb->nh.iph->saddr, /*XXX*/
1189                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1190         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1191
1192         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1193
1194         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1195         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1196 }
1197
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
1201
1202 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1203                             u32 win, u32 ts)
1204 {
1205         struct tcphdr *th = skb->h.th;
1206         struct {
1207                 struct tcphdr th;
1208                 u32 tsopt[3];
1209         } rep;
1210         struct ip_reply_arg arg;
1211
1212         memset(&rep.th, 0, sizeof(struct tcphdr));
1213         memset(&arg, 0, sizeof arg);
1214
1215         arg.iov[0].iov_base = (unsigned char *)&rep;
1216         arg.iov[0].iov_len  = sizeof(rep.th);
1217         if (ts) {
1218                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1219                                      (TCPOPT_TIMESTAMP << 8) |
1220                                      TCPOLEN_TIMESTAMP);
1221                 rep.tsopt[1] = htonl(tcp_time_stamp);
1222                 rep.tsopt[2] = htonl(ts);
1223                 arg.iov[0].iov_len = sizeof(rep);
1224         }
1225
1226         /* Swap the send and the receive. */
1227         rep.th.dest    = th->source;
1228         rep.th.source  = th->dest;
1229         rep.th.doff    = arg.iov[0].iov_len / 4;
1230         rep.th.seq     = htonl(seq);
1231         rep.th.ack_seq = htonl(ack);
1232         rep.th.ack     = 1;
1233         rep.th.window  = htons(win);
1234
1235         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1236                                       skb->nh.iph->saddr, /*XXX*/
1237                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1238         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1239
1240         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1241
1242         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1243 }
1244
1245 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1246 {
1247         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1248
1249         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1250                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1251
1252         tcp_tw_put(tw);
1253 }
1254
1255 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1256 {
1257         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1258                         req->ts_recent);
1259 }
1260
1261 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1262                                           struct request_sock *req)
1263 {
1264         struct rtable *rt;
1265         const struct inet_request_sock *ireq = inet_rsk(req);
1266         struct ip_options *opt = inet_rsk(req)->opt;
1267         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1268                             .nl_u = { .ip4_u =
1269                                       { .daddr = ((opt && opt->srr) ?
1270                                                   opt->faddr :
1271                                                   ireq->rmt_addr),
1272                                         .saddr = ireq->loc_addr,
1273                                         .tos = RT_CONN_FLAGS(sk) } },
1274                             .proto = IPPROTO_TCP,
1275                             .uli_u = { .ports =
1276                                        { .sport = inet_sk(sk)->sport,
1277                                          .dport = ireq->rmt_port } } };
1278
1279         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1280                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1281                 return NULL;
1282         }
1283         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1284                 ip_rt_put(rt);
1285                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1286                 return NULL;
1287         }
1288         return &rt->u.dst;
1289 }
1290
1291 /*
1292  *      Send a SYN-ACK after having received an ACK.
1293  *      This still operates on a request_sock only, not on a big
1294  *      socket.
1295  */
1296 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1297                               struct dst_entry *dst)
1298 {
1299         const struct inet_request_sock *ireq = inet_rsk(req);
1300         int err = -1;
1301         struct sk_buff * skb;
1302
1303         /* First, grab a route. */
1304         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1305                 goto out;
1306
1307         skb = tcp_make_synack(sk, dst, req);
1308
1309         if (skb) {
1310                 struct tcphdr *th = skb->h.th;
1311
1312                 th->check = tcp_v4_check(th, skb->len,
1313                                          ireq->loc_addr,
1314                                          ireq->rmt_addr,
1315                                          csum_partial((char *)th, skb->len,
1316                                                       skb->csum));
1317
1318                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1319                                             ireq->rmt_addr,
1320                                             ireq->opt);
1321                 if (err == NET_XMIT_CN)
1322                         err = 0;
1323         }
1324
1325 out:
1326         dst_release(dst);
1327         return err;
1328 }
1329
1330 /*
1331  *      IPv4 request_sock destructor.
1332  */
1333 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1334 {
1335         if (inet_rsk(req)->opt)
1336                 kfree(inet_rsk(req)->opt);
1337 }
1338
1339 static inline void syn_flood_warning(struct sk_buff *skb)
1340 {
1341         static unsigned long warntime;
1342
1343         if (time_after(jiffies, (warntime + HZ * 60))) {
1344                 warntime = jiffies;
1345                 printk(KERN_INFO
1346                        "possible SYN flooding on port %d. Sending cookies.\n",
1347                        ntohs(skb->h.th->dest));
1348         }
1349 }
1350
1351 /*
1352  * Save and compile IPv4 options into the request_sock if needed.
1353  */
1354 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1355                                                      struct sk_buff *skb)
1356 {
1357         struct ip_options *opt = &(IPCB(skb)->opt);
1358         struct ip_options *dopt = NULL;
1359
1360         if (opt && opt->optlen) {
1361                 int opt_size = optlength(opt);
1362                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1363                 if (dopt) {
1364                         if (ip_options_echo(dopt, skb)) {
1365                                 kfree(dopt);
1366                                 dopt = NULL;
1367                         }
1368                 }
1369         }
1370         return dopt;
1371 }
1372
1373 struct request_sock_ops tcp_request_sock_ops = {
1374         .family         =       PF_INET,
1375         .obj_size       =       sizeof(struct tcp_request_sock),
1376         .rtx_syn_ack    =       tcp_v4_send_synack,
1377         .send_ack       =       tcp_v4_reqsk_send_ack,
1378         .destructor     =       tcp_v4_reqsk_destructor,
1379         .send_reset     =       tcp_v4_send_reset,
1380 };
1381
1382 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383 {
1384         struct inet_request_sock *ireq;
1385         struct tcp_options_received tmp_opt;
1386         struct request_sock *req;
1387         __u32 saddr = skb->nh.iph->saddr;
1388         __u32 daddr = skb->nh.iph->daddr;
1389         __u32 isn = TCP_SKB_CB(skb)->when;
1390         struct dst_entry *dst = NULL;
1391 #ifdef CONFIG_SYN_COOKIES
1392         int want_cookie = 0;
1393 #else
1394 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1395 #endif
1396
1397         /* Never answer to SYNs send to broadcast or multicast */
1398         if (((struct rtable *)skb->dst)->rt_flags &
1399             (RTCF_BROADCAST | RTCF_MULTICAST))
1400                 goto drop;
1401
1402         /* TW buckets are converted to open requests without
1403          * limitations, they conserve resources and peer is
1404          * evidently real one.
1405          */
1406         if (tcp_synq_is_full(sk) && !isn) {
1407 #ifdef CONFIG_SYN_COOKIES
1408                 if (sysctl_tcp_syncookies) {
1409                         want_cookie = 1;
1410                 } else
1411 #endif
1412                 goto drop;
1413         }
1414
1415         /* Accept backlog is full. If we have already queued enough
1416          * of warm entries in syn queue, drop request. It is better than
1417          * clogging syn queue with openreqs with exponentially increasing
1418          * timeout.
1419          */
1420         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1421                 goto drop;
1422
1423         req = reqsk_alloc(&tcp_request_sock_ops);
1424         if (!req)
1425                 goto drop;
1426
1427         tcp_clear_options(&tmp_opt);
1428         tmp_opt.mss_clamp = 536;
1429         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1430
1431         tcp_parse_options(skb, &tmp_opt, 0);
1432
1433         if (want_cookie) {
1434                 tcp_clear_options(&tmp_opt);
1435                 tmp_opt.saw_tstamp = 0;
1436         }
1437
1438         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1439                 /* Some OSes (unknown ones, but I see them on web server, which
1440                  * contains information interesting only for windows'
1441                  * users) do not send their stamp in SYN. It is easy case.
1442                  * We simply do not advertise TS support.
1443                  */
1444                 tmp_opt.saw_tstamp = 0;
1445                 tmp_opt.tstamp_ok  = 0;
1446         }
1447         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1448
1449         tcp_openreq_init(req, &tmp_opt, skb);
1450
1451         ireq = inet_rsk(req);
1452         ireq->loc_addr = daddr;
1453         ireq->rmt_addr = saddr;
1454         ireq->opt = tcp_v4_save_options(sk, skb);
1455         if (!want_cookie)
1456                 TCP_ECN_create_request(req, skb->h.th);
1457
1458         if (want_cookie) {
1459 #ifdef CONFIG_SYN_COOKIES
1460                 syn_flood_warning(skb);
1461 #endif
1462                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1463         } else if (!isn) {
1464                 struct inet_peer *peer = NULL;
1465
1466                 /* VJ's idea. We save last timestamp seen
1467                  * from the destination in peer table, when entering
1468                  * state TIME-WAIT, and check against it before
1469                  * accepting new connection request.
1470                  *
1471                  * If "isn" is not zero, this request hit alive
1472                  * timewait bucket, so that all the necessary checks
1473                  * are made in the function processing timewait state.
1474                  */
1475                 if (tmp_opt.saw_tstamp &&
1476                     sysctl_tcp_tw_recycle &&
1477                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1478                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1479                     peer->v4daddr == saddr) {
1480                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1481                             (s32)(peer->tcp_ts - req->ts_recent) >
1482                                                         TCP_PAWS_WINDOW) {
1483                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1484                                 dst_release(dst);
1485                                 goto drop_and_free;
1486                         }
1487                 }
1488                 /* Kill the following clause, if you dislike this way. */
1489                 else if (!sysctl_tcp_syncookies &&
1490                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1491                           (sysctl_max_syn_backlog >> 2)) &&
1492                          (!peer || !peer->tcp_ts_stamp) &&
1493                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1494                         /* Without syncookies last quarter of
1495                          * backlog is filled with destinations,
1496                          * proven to be alive.
1497                          * It means that we continue to communicate
1498                          * to destinations, already remembered
1499                          * to the moment of synflood.
1500                          */
1501                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1502                                               "request from %u.%u."
1503                                               "%u.%u/%u\n",
1504                                               NIPQUAD(saddr),
1505                                               ntohs(skb->h.th->source)));
1506                         dst_release(dst);
1507                         goto drop_and_free;
1508                 }
1509
1510                 isn = tcp_v4_init_sequence(sk, skb);
1511         }
1512         tcp_rsk(req)->snt_isn = isn;
1513
1514         if (tcp_v4_send_synack(sk, req, dst))
1515                 goto drop_and_free;
1516
1517         if (want_cookie) {
1518                 reqsk_free(req);
1519         } else {
1520                 tcp_v4_synq_add(sk, req);
1521         }
1522         return 0;
1523
1524 drop_and_free:
1525         reqsk_free(req);
1526 drop:
1527         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1528         return 0;
1529 }
1530
1531
1532 /*
1533  * The three way handshake has completed - we got a valid synack -
1534  * now create the new socket.
1535  */
1536 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1537                                   struct request_sock *req,
1538                                   struct dst_entry *dst)
1539 {
1540         struct inet_request_sock *ireq;
1541         struct inet_sock *newinet;
1542         struct tcp_sock *newtp;
1543         struct sock *newsk;
1544
1545         if (sk_acceptq_is_full(sk))
1546                 goto exit_overflow;
1547
1548         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1549                 goto exit;
1550
1551         newsk = tcp_create_openreq_child(sk, req, skb);
1552         if (!newsk)
1553                 goto exit;
1554
1555         sk_setup_caps(newsk, dst);
1556
1557         newtp                 = tcp_sk(newsk);
1558         newinet               = inet_sk(newsk);
1559         ireq                  = inet_rsk(req);
1560         newinet->daddr        = ireq->rmt_addr;
1561         newinet->rcv_saddr    = ireq->loc_addr;
1562         newinet->saddr        = ireq->loc_addr;
1563         newinet->opt          = ireq->opt;
1564         ireq->opt             = NULL;
1565         newinet->mc_index     = tcp_v4_iif(skb);
1566         newinet->mc_ttl       = skb->nh.iph->ttl;
1567         newtp->ext_header_len = 0;
1568         if (newinet->opt)
1569                 newtp->ext_header_len = newinet->opt->optlen;
1570         newinet->id = newtp->write_seq ^ jiffies;
1571
1572         tcp_sync_mss(newsk, dst_mtu(dst));
1573         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1574         tcp_initialize_rcv_mss(newsk);
1575
1576         __tcp_v4_hash(newsk, 0);
1577         __tcp_inherit_port(sk, newsk);
1578
1579         return newsk;
1580
1581 exit_overflow:
1582         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1583 exit:
1584         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1585         dst_release(dst);
1586         return NULL;
1587 }
1588
1589 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1590 {
1591         struct tcphdr *th = skb->h.th;
1592         struct iphdr *iph = skb->nh.iph;
1593         struct tcp_sock *tp = tcp_sk(sk);
1594         struct sock *nsk;
1595         struct request_sock **prev;
1596         /* Find possible connection requests. */
1597         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1598                                                      iph->saddr, iph->daddr);
1599         if (req)
1600                 return tcp_check_req(sk, skb, req, prev);
1601
1602         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1603                                           th->source,
1604                                           skb->nh.iph->daddr,
1605                                           ntohs(th->dest),
1606                                           tcp_v4_iif(skb));
1607
1608         if (nsk) {
1609                 if (nsk->sk_state != TCP_TIME_WAIT) {
1610                         bh_lock_sock(nsk);
1611                         return nsk;
1612                 }
1613                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1614                 return NULL;
1615         }
1616
1617 #ifdef CONFIG_SYN_COOKIES
1618         if (!th->rst && !th->syn && th->ack)
1619                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1620 #endif
1621         return sk;
1622 }
1623
1624 static int tcp_v4_checksum_init(struct sk_buff *skb)
1625 {
1626         if (skb->ip_summed == CHECKSUM_HW) {
1627                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1628                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1629                                   skb->nh.iph->daddr, skb->csum))
1630                         return 0;
1631
1632                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1633                 skb->ip_summed = CHECKSUM_NONE;
1634         }
1635         if (skb->len <= 76) {
1636                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1637                                  skb->nh.iph->daddr,
1638                                  skb_checksum(skb, 0, skb->len, 0)))
1639                         return -1;
1640                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1641         } else {
1642                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1643                                           skb->nh.iph->saddr,
1644                                           skb->nh.iph->daddr, 0);
1645         }
1646         return 0;
1647 }
1648
1649
1650 /* The socket must have its spinlock held when we get
1651  * here.
1652  *
1653  * We have a potential double-lock case here, so even when
1654  * doing backlog processing we use the BH locking scheme.
1655  * This is because we cannot sleep with the original spinlock
1656  * held.
      *
      * Dispatch summary:
      *   - TCP_ESTABLISHED goes straight to tcp_rcv_established();
      *   - otherwise the header length and checksum are verified first;
      *   - TCP_LISTEN asks tcp_v4_hnd_req() which socket should take the
      *     segment and, if it is a different one, hands the segment to
      *     tcp_child_process();
      *   - everything else goes through tcp_rcv_state_process().
      *
      * Every return statement in this function returns 0.  The skb is
      * freed here only on the reset/discard/csum_err paths.
1657  */
1658 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1659 {
1660         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1661                 TCP_CHECK_TIMER(sk);
1662                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1663                         goto reset;
1664                 TCP_CHECK_TIMER(sk);
1665                 return 0;
1666         }
1667
             /* Slow path: segment shorter than its own header, or bad checksum. */
1668         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1669                 goto csum_err;
1670
1671         if (sk->sk_state == TCP_LISTEN) {
1672                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1673                 if (!nsk)
1674                         goto discard;
1675
                     /* A socket other than the listener was selected for this segment. */
1676                 if (nsk != sk) {
1677                         if (tcp_child_process(sk, nsk, skb))
1678                                 goto reset;
1679                         return 0;
1680                 }
1681         }
1682
1683         TCP_CHECK_TIMER(sk);
1684         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1685                 goto reset;
1686         TCP_CHECK_TIMER(sk);
1687         return 0;
1688
1689 reset:
1690         tcp_v4_send_reset(skb);
             /* Falls through: the skb is freed after the reset has been sent. */
1691 discard:
1692         kfree_skb(skb);
1693         /* Be careful here. If this function gets more complicated and
1694          * gcc suffers from register pressure on the x86, sk (in %ebx)
1695          * might be destroyed here. This current version compiles correctly,
1696          * but you have been warned.
1697          */
1698         return 0;
1699
1700 csum_err:
1701         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1702         goto discard;
1703 }
1704
1705 /*
1706  *      From tcp_input.c
      *
      *      tcp_v4_rcv() validates the TCP header of an incoming skb, sets
      *      up its checksum state and control block, looks up the owning
      *      socket and delivers the segment to it: directly through
      *      tcp_v4_do_rcv(), through the prequeue, or through the socket
      *      backlog when the socket is owned by a user context.
      *
      *      Returns the value of tcp_v4_do_rcv() when that function was
      *      called, and 0 on every other path.  On the drop paths the skb
      *      is freed here with kfree_skb().
1707  */
1708
1709 int tcp_v4_rcv(struct sk_buff *skb)
1710 {
1711         struct tcphdr *th;
1712         struct sock *sk;
1713         int ret;
1714
1715         if (skb->pkt_type != PACKET_HOST)
1716                 goto discard_it;
1717
1718         /* Count it even if it's bad */
1719         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1720
1721         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1722                 goto discard_it;
1723
1724         th = skb->h.th;
1725
             /* doff counts 32-bit words; anything below the fixed header size is bogus. */
1726         if (th->doff < sizeof(struct tcphdr) / 4)
1727                 goto bad_packet;
1728         if (!pskb_may_pull(skb, th->doff * 4))
1729                 goto discard_it;
1730
1731         /* An explanation is required here, I think.
1732          * Packet length and doff are validated by header prediction,
1733          * provided case of th->doff==0 is eliminated.
1734          * So, we defer the checks. */
1735         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1736              tcp_v4_checksum_init(skb) < 0))
1737                 goto bad_packet;
1738
             /* NOTE(review): th is reloaded here, presumably because the second
              * pskb_may_pull() above may have moved the header - confirm.
              */
1739         th = skb->h.th;
             /* Fill in the per-skb TCP control block from the header fields. */
1740         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1741         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1742                                     skb->len - th->doff * 4);
1743         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1744         TCP_SKB_CB(skb)->when    = 0;
1745         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1746         TCP_SKB_CB(skb)->sacked  = 0;
1747
1748         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1749                              skb->nh.iph->daddr, ntohs(th->dest),
1750                              tcp_v4_iif(skb));
1751
1752         if (!sk)
1753                 goto no_tcp_socket;
1754
     /* Also re-entered from the TIME_WAIT path below with sk replaced by a listener. */
1755 process:
1756         if (sk->sk_state == TCP_TIME_WAIT)
1757                 goto do_time_wait;
1758
1759         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1760                 goto discard_and_relse;
1761
1762         if (sk_filter(sk, skb, 0))
1763                 goto discard_and_relse;
1764
1765         skb->dev = NULL;
1766
1767         bh_lock_sock(sk);
1768         ret = 0;
             /* Socket not owned by a user context: try the prequeue, else process now.
              * Otherwise queue the segment on the socket backlog.
              */
1769         if (!sock_owned_by_user(sk)) {
1770                 if (!tcp_prequeue(sk, skb))
1771                         ret = tcp_v4_do_rcv(sk, skb);
1772         } else
1773                 sk_add_backlog(sk, skb);
1774         bh_unlock_sock(sk);
1775
1776         sock_put(sk);
1777
1778         return ret;
1779
1780 no_tcp_socket:
1781         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1782                 goto discard_it;
1783
             /* The bad_packet label sits inside the if-body on purpose: jumps to it
              * from the header checks above only bump the error counter and never
              * reach the reset in the else branch.
              */
1784         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1785 bad_packet:
1786                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1787         } else {
1788                 tcp_v4_send_reset(skb);
1789         }
1790
1791 discard_it:
1792         /* Discard frame. */
1793         kfree_skb(skb);
1794         return 0;
1795
1796 discard_and_relse:
1797         sock_put(sk);
1798         goto discard_it;
1799
     /* sk is really a struct tcp_tw_bucket here, hence the casts below. */
1800 do_time_wait:
1801         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1802                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1803                 goto discard_it;
1804         }
1805
1806         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1807                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1808                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1809                 goto discard_it;
1810         }
1811         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1812                                            skb, th, skb->len)) {
1813         case TCP_TW_SYN: {
                     /* If a listener exists for the destination, retire the
                      * time-wait bucket and process the segment on the listener.
                      */
1814                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1815                                                           ntohs(th->dest),
1816                                                           tcp_v4_iif(skb));
1817                 if (sk2) {
1818                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1819                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1820                         sk = sk2;
1821                         goto process;
1822                 }
1823                 /* Fall through to ACK */
1824         }
1825         case TCP_TW_ACK:
1826                 tcp_v4_timewait_ack(sk, skb);
1827                 break;
1828         case TCP_TW_RST:
1829                 goto no_tcp_socket;
             /* Nothing to send; the segment is simply dropped below. */
1830         case TCP_TW_SUCCESS:;
1831         }
1832         goto discard_it;
1833 }
1834
1835 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1836 {
1837         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1838         struct inet_sock *inet = inet_sk(sk);
1839
1840         sin->sin_family         = AF_INET;
1841         sin->sin_addr.s_addr    = inet->daddr;
1842         sin->sin_port           = inet->dport;
1843 }
1844
1845 /* VJ's idea. Save last timestamp seen from this destination
1846  * and hold it at least for normal timewait interval to use for duplicate
1847  * segment detection in subsequent connections, before they enter synchronized
1848  * state.
1849  */
1850
1851 int tcp_v4_remember_stamp(struct sock *sk)
1852 {
1853         struct inet_sock *inet = inet_sk(sk);
1854         struct tcp_sock *tp = tcp_sk(sk);
1855         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1856         struct inet_peer *peer = NULL;
1857         int release_it = 0;
1858
1859         if (!rt || rt->rt_dst != inet->daddr) {
1860                 peer = inet_getpeer(inet->daddr, 1);
1861                 release_it = 1;
1862         } else {
1863                 if (!rt->peer)
1864                         rt_bind_peer(rt, 1);
1865                 peer = rt->peer;
1866         }
1867
1868         if (peer) {
1869                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1870                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1871                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1872                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1873                         peer->tcp_ts = tp->rx_opt.ts_recent;
1874                 }
1875                 if (release_it)
1876                         inet_putpeer(peer);
1877                 return 1;
1878         }
1879
1880         return 0;
1881 }
1882
1883 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1884 {
1885         struct inet_peer *peer = NULL;
1886
1887         peer = inet_getpeer(tw->tw_daddr, 1);
1888
1889         if (peer) {
1890                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1891                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1892                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1893                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1894                         peer->tcp_ts = tw->tw_ts_recent;
1895                 }
1896                 inet_putpeer(peer);
1897                 return 1;
1898         }
1899
1900         return 0;
1901 }
1902
1903 struct tcp_func ipv4_specific = {
1904         .queue_xmit     =       ip_queue_xmit,
1905         .send_check     =       tcp_v4_send_check,
1906         .rebuild_header =       inet_sk_rebuild_header,
1907         .conn_request   =       tcp_v4_conn_request,
1908         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1909         .remember_stamp =       tcp_v4_remember_stamp,
1910         .net_header_len =       sizeof(struct iphdr),
1911         .setsockopt     =       ip_setsockopt,
1912         .getsockopt     =       ip_getsockopt,
1913         .addr2sockaddr  =       v4_addr2sockaddr,
1914         .sockaddr_len   =       sizeof(struct sockaddr_in),
1915 };
1916
1917 /* NOTE: A lot of things set to zero explicitly by call to
1918  *       sk_alloc() so need not be done here.
1919  */
1920 static int tcp_v4_init_sock(struct sock *sk)
1921 {
1922         struct tcp_sock *tp = tcp_sk(sk);
1923
1924         skb_queue_head_init(&tp->out_of_order_queue);
1925         tcp_init_xmit_timers(sk);
1926         tcp_prequeue_init(tp);
1927
1928         tp->rto  = TCP_TIMEOUT_INIT;
1929         tp->mdev = TCP_TIMEOUT_INIT;
1930
1931         /* So many TCP implementations out there (incorrectly) count the
1932          * initial SYN frame in their delayed-ACK and congestion control
1933          * algorithms that we must have the following bandaid to talk
1934          * efficiently to them.  -DaveM
1935          */
1936         tp->snd_cwnd = 2;
1937
1938         /* See draft-stevens-tcpca-spec-01 for discussion of the
1939          * initialization of these values.
1940          */
1941         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1942         tp->snd_cwnd_clamp = ~0;
1943         tp->mss_cache = 536;
1944
1945         tp->reordering = sysctl_tcp_reordering;
1946         tp->ca_ops = &tcp_init_congestion_ops;
1947
1948         sk->sk_state = TCP_CLOSE;
1949
1950         sk->sk_write_space = sk_stream_write_space;
1951         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1952
1953         tp->af_specific = &ipv4_specific;
1954
1955         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1956         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1957
1958         atomic_inc(&tcp_sockets_allocated);
1959
1960         return 0;
1961 }
1962
1963 int tcp_v4_destroy_sock(struct sock *sk)
1964 {
1965         struct tcp_sock *tp = tcp_sk(sk);
1966
1967         tcp_clear_xmit_timers(sk);
1968
1969         tcp_cleanup_congestion_control(tp);
1970
1971         /* Cleanup up the write buffer. */
1972         sk_stream_writequeue_purge(sk);
1973
1974         /* Cleans up our, hopefully empty, out_of_order_queue. */
1975         __skb_queue_purge(&tp->out_of_order_queue);
1976
1977         /* Clean prequeue, it must be empty really */
1978         __skb_queue_purge(&tp->ucopy.prequeue);
1979
1980         /* Clean up a referenced TCP bind bucket. */
1981         if (tp->bind_hash)
1982                 tcp_put_port(sk);
1983
1984         /*
1985          * If sendmsg cached page exists, toss it.
1986          */
1987         if (sk->sk_sndmsg_page) {
1988                 __free_page(sk->sk_sndmsg_page);
1989                 sk->sk_sndmsg_page = NULL;
1990         }
1991
1992         atomic_dec(&tcp_sockets_allocated);
1993
1994         return 0;
1995 }
1996
1997 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1998
1999 #ifdef CONFIG_PROC_FS
2000 /* Proc filesystem TCP sock list dumping. */
2001
2002 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
2003 {
2004         return hlist_empty(head) ? NULL :
2005                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
2006 }
2007
2008 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
2009 {
2010         return tw->tw_node.next ?
2011                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2012 }
2013
/*
 * Advance the /proc iterator over listening sockets and their pending
 * connection requests.
 *
 * @cur is the previously returned element (a struct sock in state
 * TCP_SEQ_STATE_LISTENING, a struct request_sock in state
 * TCP_SEQ_STATE_OPENREQ) or NULL to start from the first bucket of
 * tcp_listening_hash.  Only entries whose family matches st->family
 * are returned.
 *
 * When a request_sock is returned, the syn_wait_lock of st->syn_wait_sk
 * is still read-locked; it is dropped on a later call once that
 * listener's request table is exhausted.
 *
 * Returns the next element, or NULL after the last listening bucket.
 */
2014 static void *listening_get_next(struct seq_file *seq, void *cur)
2015 {
2016         struct tcp_sock *tp;
2017         struct hlist_node *node;
2018         struct sock *sk = cur;
2019         struct tcp_iter_state* st = seq->private;
2020
             /* First call: start at the head of bucket 0. */
2021         if (!sk) {
2022                 st->bucket = 0;
2023                 sk = sk_head(&tcp_listening_hash[0]);
2024                 goto get_sk;
2025         }
2026
2027         ++st->num;
2028
2029         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request on st->syn_wait_sk; continue after it. */
2030                 struct request_sock *req = cur;
2031
2032                 tp = tcp_sk(st->syn_wait_sk);
2033                 req = req->dl_next;
2034                 while (1) {
2035                         while (req) {
2036                                 if (req->rsk_ops->family == st->family) {
2037                                         cur = req;
2038                                         goto out;
2039                                 }
2040                                 req = req->dl_next;
2041                         }
2042                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2043                                 break;
     /* Entered from start_req below with tp, st->sbucket and the lock already set up. */
2044 get_req:
2045                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2046                 }
                     /* Request table exhausted: go back to walking listeners. */
2047                 sk        = sk_next(st->syn_wait_sk);
2048                 st->state = TCP_SEQ_STATE_LISTENING;
2049                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2050         } else {
                     /* cur is a listener that was already returned; walk its
                      * pending requests (if any) before moving on.
                      */
2051                 tp = tcp_sk(sk);
2052                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2053                 if (reqsk_queue_len(&tp->accept_queue))
2054                         goto start_req;
2055                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2056                 sk = sk_next(sk);
2057         }
2058 get_sk:
2059         sk_for_each_from(sk, node) {
2060                 if (sk->sk_family == st->family) {
2061                         cur = sk;
2062                         goto out;
2063                 }
                     /* Listener of another family: its requests may still match. */
2064                 tp = tcp_sk(sk);
2065                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2066                 if (reqsk_queue_len(&tp->accept_queue)) {
2067 start_req:
2068                         st->uid         = sock_i_uid(sk);
2069                         st->syn_wait_sk = sk;
2070                         st->state       = TCP_SEQ_STATE_OPENREQ;
2071                         st->sbucket     = 0;
2072                         goto get_req;
2073                 }
2074                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2075         }
             /* Chain finished: move on to the next listening hash bucket. */
2076         if (++st->bucket < TCP_LHTABLE_SIZE) {
2077                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2078                 goto get_sk;
2079         }
2080         cur = NULL;
2081 out:
2082         return cur;
2083 }
2084
2085 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2086 {
2087         void *rc = listening_get_next(seq, NULL);
2088
2089         while (rc && *pos) {
2090                 rc = listening_get_next(seq, rc);
2091                 --*pos;
2092         }
2093         return rc;
2094 }
2095
2096 static void *established_get_first(struct seq_file *seq)
2097 {
2098         struct tcp_iter_state* st = seq->private;
2099         void *rc = NULL;
2100
2101         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2102                 struct sock *sk;
2103                 struct hlist_node *node;
2104                 struct tcp_tw_bucket *tw;
2105
2106                 /* We can reschedule _before_ having picked the target: */
2107                 cond_resched_softirq();
2108
2109                 read_lock(&tcp_ehash[st->bucket].lock);
2110                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2111                         if (sk->sk_family != st->family) {
2112                                 continue;
2113                         }
2114                         rc = sk;
2115                         goto out;
2116                 }
2117                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2118                 tw_for_each(tw, node,
2119                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2120                         if (tw->tw_family != st->family) {
2121                                 continue;
2122                         }
2123                         rc = tw;
2124                         goto out;
2125                 }
2126                 read_unlock(&tcp_ehash[st->bucket].lock);
2127                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2128         }
2129 out:
2130         return rc;
2131 }
2132
/*
 * Advance the /proc iterator over tcp_ehash.
 *
 * @cur is the previously returned element: a struct sock while
 * st->state is TCP_SEQ_STATE_ESTABLISHED, a struct tcp_tw_bucket while
 * it is TCP_SEQ_STATE_TIME_WAIT.  Within a bucket the socket chain is
 * walked first, then the time-wait chain stored tcp_ehash_size slots
 * further on.  Only entries of family st->family are returned.
 *
 * The read lock of bucket st->bucket is expected to be held on entry
 * (it is unlocked here when the bucket is finished) and is held again
 * for the bucket of the returned element.  Returns NULL, with no bucket
 * lock held, once the last bucket has been walked.
 */
2133 static void *established_get_next(struct seq_file *seq, void *cur)
2134 {
2135         struct sock *sk = cur;
2136         struct tcp_tw_bucket *tw;
2137         struct hlist_node *node;
2138         struct tcp_iter_state* st = seq->private;
2139
2140         ++st->num;
2141
2142         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2143                 tw = cur;
2144                 tw = tw_next(tw);
     /* Also entered from the bottom of the function with tw at the chain head. */
2145 get_tw:
2146                 while (tw && tw->tw_family != st->family) {
2147                         tw = tw_next(tw);
2148                 }
2149                 if (tw) {
2150                         cur = tw;
2151                         goto out;
2152                 }
                     /* Both chains of this bucket are done: release it. */
2153                 read_unlock(&tcp_ehash[st->bucket].lock);
2154                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2155
2156                 /* We can reschedule between buckets: */
2157                 cond_resched_softirq();
2158
2159                 if (++st->bucket < tcp_ehash_size) {
2160                         read_lock(&tcp_ehash[st->bucket].lock);
2161                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2162                 } else {
2163                         cur = NULL;
2164                         goto out;
2165                 }
2166         } else
2167                 sk = sk_next(sk);
2168
2169         sk_for_each_from(sk, node) {
2170                 if (sk->sk_family == st->family)
2171                         goto found;
2172         }
2173
             /* Socket chain exhausted: continue with this bucket's time-wait chain. */
2174         st->state = TCP_SEQ_STATE_TIME_WAIT;
2175         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2176         goto get_tw;
2177 found:
2178         cur = sk;
2179 out:
2180         return cur;
2181 }
2182
2183 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2184 {
2185         void *rc = established_get_first(seq);
2186
2187         while (rc && pos) {
2188                 rc = established_get_next(seq, rc);
2189                 --pos;
2190         }               
2191         return rc;
2192 }
2193
2194 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2195 {
2196         void *rc;
2197         struct tcp_iter_state* st = seq->private;
2198
2199         tcp_listen_lock();
2200         st->state = TCP_SEQ_STATE_LISTENING;
2201         rc        = listening_get_idx(seq, &pos);
2202
2203         if (!rc) {
2204                 tcp_listen_unlock();
2205                 local_bh_disable();
2206                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2207                 rc        = established_get_idx(seq, pos);
2208         }
2209
2210         return rc;
2211 }
2212
2213 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2214 {
2215         struct tcp_iter_state* st = seq->private;
2216         st->state = TCP_SEQ_STATE_LISTENING;
2217         st->num = 0;
2218         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2219 }
2220
2221 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2222 {
2223         void *rc = NULL;
2224         struct tcp_iter_state* st;
2225
2226         if (v == SEQ_START_TOKEN) {
2227                 rc = tcp_get_idx(seq, 0);
2228                 goto out;
2229         }
2230         st = seq->private;
2231
2232         switch (st->state) {
2233         case TCP_SEQ_STATE_OPENREQ:
2234         case TCP_SEQ_STATE_LISTENING:
2235                 rc = listening_get_next(seq, v);
2236                 if (!rc) {
2237                         tcp_listen_unlock();
2238                         local_bh_disable();
2239                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2240                         rc        = established_get_first(seq);
2241                 }
2242                 break;
2243         case TCP_SEQ_STATE_ESTABLISHED:
2244         case TCP_SEQ_STATE_TIME_WAIT:
2245                 rc = established_get_next(seq, v);
2246                 break;
2247         }
2248 out:
2249         ++*pos;
2250         return rc;
2251 }
2252
2253 static void tcp_seq_stop(struct seq_file *seq, void *v)
2254 {
2255         struct tcp_iter_state* st = seq->private;
2256
2257         switch (st->state) {
2258         case TCP_SEQ_STATE_OPENREQ:
2259                 if (v) {
2260                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2261                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2262                 }
2263         case TCP_SEQ_STATE_LISTENING:
2264                 if (v != SEQ_START_TOKEN)
2265                         tcp_listen_unlock();
2266                 break;
2267         case TCP_SEQ_STATE_TIME_WAIT:
2268         case TCP_SEQ_STATE_ESTABLISHED:
2269                 if (v)
2270                         read_unlock(&tcp_ehash[st->bucket].lock);
2271                 local_bh_enable();
2272                 break;
2273         }
2274 }
2275
2276 static int tcp_seq_open(struct inode *inode, struct file *file)
2277 {
2278         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2279         struct seq_file *seq;
2280         struct tcp_iter_state *s;
2281         int rc;
2282
2283         if (unlikely(afinfo == NULL))
2284                 return -EINVAL;
2285
2286         s = kmalloc(sizeof(*s), GFP_KERNEL);
2287         if (!s)
2288                 return -ENOMEM;
2289         memset(s, 0, sizeof(*s));
2290         s->family               = afinfo->family;
2291         s->seq_ops.start        = tcp_seq_start;
2292         s->seq_ops.next         = tcp_seq_next;
2293         s->seq_ops.show         = afinfo->seq_show;
2294         s->seq_ops.stop         = tcp_seq_stop;
2295
2296         rc = seq_open(file, &s->seq_ops);
2297         if (rc)
2298                 goto out_kfree;
2299         seq          = file->private_data;
2300         seq->private = s;
2301 out:
2302         return rc;
2303 out_kfree:
2304         kfree(s);
2305         goto out;
2306 }
2307
2308 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2309 {
2310         int rc = 0;
2311         struct proc_dir_entry *p;
2312
2313         if (!afinfo)
2314                 return -EINVAL;
2315         afinfo->seq_fops->owner         = afinfo->owner;
2316         afinfo->seq_fops->open          = tcp_seq_open;
2317         afinfo->seq_fops->read          = seq_read;
2318         afinfo->seq_fops->llseek        = seq_lseek;
2319         afinfo->seq_fops->release       = seq_release_private;
2320         
2321         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2322         if (p)
2323                 p->data = afinfo;
2324         else
2325                 rc = -ENOMEM;
2326         return rc;
2327 }
2328
2329 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2330 {
2331         if (!afinfo)
2332                 return;
2333         proc_net_remove(afinfo->name);
2334         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2335 }
2336
2337 static void get_openreq4(struct sock *sk, struct request_sock *req,
2338                          char *tmpbuf, int i, int uid)
2339 {
2340         const struct inet_request_sock *ireq = inet_rsk(req);
2341         int ttd = req->expires - jiffies;
2342
2343         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2344                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2345                 i,
2346                 ireq->loc_addr,
2347                 ntohs(inet_sk(sk)->sport),
2348                 ireq->rmt_addr,
2349                 ntohs(ireq->rmt_port),
2350                 TCP_SYN_RECV,
2351                 0, 0, /* could print option size, but that is af dependent. */
2352                 1,    /* timers active (only the expire timer) */
2353                 jiffies_to_clock_t(ttd),
2354                 req->retrans,
2355                 uid,
2356                 0,  /* non standard timer */
2357                 0, /* open_requests have no inode */
2358                 atomic_read(&sk->sk_refcnt),
2359                 req);
2360 }
2361
2362 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2363 {
2364         int timer_active;
2365         unsigned long timer_expires;
2366         struct tcp_sock *tp = tcp_sk(sp);
2367         struct inet_sock *inet = inet_sk(sp);
2368         unsigned int dest = inet->daddr;
2369         unsigned int src = inet->rcv_saddr;
2370         __u16 destp = ntohs(inet->dport);
2371         __u16 srcp = ntohs(inet->sport);
2372
2373         if (tp->pending == TCP_TIME_RETRANS) {
2374                 timer_active    = 1;
2375                 timer_expires   = tp->timeout;
2376         } else if (tp->pending == TCP_TIME_PROBE0) {
2377                 timer_active    = 4;
2378                 timer_expires   = tp->timeout;
2379         } else if (timer_pending(&sp->sk_timer)) {
2380                 timer_active    = 2;
2381                 timer_expires   = sp->sk_timer.expires;
2382         } else {
2383                 timer_active    = 0;
2384                 timer_expires = jiffies;
2385         }
2386
2387         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2388                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2389                 i, src, srcp, dest, destp, sp->sk_state,
2390                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2391                 timer_active,
2392                 jiffies_to_clock_t(timer_expires - jiffies),
2393                 tp->retransmits,
2394                 sock_i_uid(sp),
2395                 tp->probes_out,
2396                 sock_i_ino(sp),
2397                 atomic_read(&sp->sk_refcnt), sp,
2398                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2399                 tp->snd_cwnd,
2400                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2401 }
2402
2403 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2404 {
2405         unsigned int dest, src;
2406         __u16 destp, srcp;
2407         int ttd = tw->tw_ttd - jiffies;
2408
2409         if (ttd < 0)
2410                 ttd = 0;
2411
2412         dest  = tw->tw_daddr;
2413         src   = tw->tw_rcv_saddr;
2414         destp = ntohs(tw->tw_dport);
2415         srcp  = ntohs(tw->tw_sport);
2416
2417         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2418                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2419                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2420                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2421                 atomic_read(&tw->tw_refcnt), tw);
2422 }
2423
2424 #define TMPSZ 150
2425
2426 static int tcp4_seq_show(struct seq_file *seq, void *v)
2427 {
2428         struct tcp_iter_state* st;
2429         char tmpbuf[TMPSZ + 1];
2430
2431         if (v == SEQ_START_TOKEN) {
2432                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2433                            "  sl  local_address rem_address   st tx_queue "
2434                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2435                            "inode");
2436                 goto out;
2437         }
2438         st = seq->private;
2439
2440         switch (st->state) {
2441         case TCP_SEQ_STATE_LISTENING:
2442         case TCP_SEQ_STATE_ESTABLISHED:
2443                 get_tcp4_sock(v, tmpbuf, st->num);
2444                 break;
2445         case TCP_SEQ_STATE_OPENREQ:
2446                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2447                 break;
2448         case TCP_SEQ_STATE_TIME_WAIT:
2449                 get_timewait4_sock(v, tmpbuf, st->num);
2450                 break;
2451         }
2452         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2453 out:
2454         return 0;
2455 }
2456
2457 static struct file_operations tcp4_seq_fops;
2458 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2459         .owner          = THIS_MODULE,
2460         .name           = "tcp",
2461         .family         = AF_INET,
2462         .seq_show       = tcp4_seq_show,
2463         .seq_fops       = &tcp4_seq_fops,
2464 };
2465
2466 int __init tcp4_proc_init(void)
2467 {
2468         return tcp_proc_register(&tcp4_seq_afinfo);
2469 }
2470
2471 void tcp4_proc_exit(void)
2472 {
2473         tcp_proc_unregister(&tcp4_seq_afinfo);
2474 }
2475 #endif /* CONFIG_PROC_FS */
2476
2477 struct proto tcp_prot = {
2478         .name                   = "TCP",
2479         .owner                  = THIS_MODULE,
2480         .close                  = tcp_close,
2481         .connect                = tcp_v4_connect,
2482         .disconnect             = tcp_disconnect,
2483         .accept                 = tcp_accept,
2484         .ioctl                  = tcp_ioctl,
2485         .init                   = tcp_v4_init_sock,
2486         .destroy                = tcp_v4_destroy_sock,
2487         .shutdown               = tcp_shutdown,
2488         .setsockopt             = tcp_setsockopt,
2489         .getsockopt             = tcp_getsockopt,
2490         .sendmsg                = tcp_sendmsg,
2491         .recvmsg                = tcp_recvmsg,
2492         .backlog_rcv            = tcp_v4_do_rcv,
2493         .hash                   = tcp_v4_hash,
2494         .unhash                 = tcp_unhash,
2495         .get_port               = tcp_v4_get_port,
2496         .enter_memory_pressure  = tcp_enter_memory_pressure,
2497         .sockets_allocated      = &tcp_sockets_allocated,
2498         .memory_allocated       = &tcp_memory_allocated,
2499         .memory_pressure        = &tcp_memory_pressure,
2500         .sysctl_mem             = sysctl_tcp_mem,
2501         .sysctl_wmem            = sysctl_tcp_wmem,
2502         .sysctl_rmem            = sysctl_tcp_rmem,
2503         .max_header             = MAX_TCP_HEADER,
2504         .obj_size               = sizeof(struct tcp_sock),
2505         .rsk_prot               = &tcp_request_sock_ops,
2506 };
2507
2508
2509
2510 void __init tcp_v4_init(struct net_proto_family *ops)
2511 {
2512         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2513         if (err < 0)
2514                 panic("Failed to create the TCP control socket.\n");
2515         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2516         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2517
2518         /* Unhash it so that IP input processing does not even
2519          * see it, we do not wish this socket to see incoming
2520          * packets.
2521          */
2522         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2523 }
2524
/* Symbols exported from the IPv4 TCP code. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* /proc registration helpers exist only with procfs support. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

/* Tunables. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2549