1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol (TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties removed from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or (at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
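
/*
 * A rough sketch of the common walks through the states above: an active
 * close usually goes ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT
 * -> CLOSE (or through CLOSING on a simultaneous close), while the
 * passive side goes ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE.
 * TCP_LISTEN is not described above; it is set up and torn down by
 * tcp_listen_start() and tcp_listen_stop() below.
 */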
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non-atomically.
296  * All of sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317                                                poll_table *wait)
318 {
319         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321
322 /*
323  *      Wait for a TCP event.
324  *
325  *      Note that we don't need to lock the socket, as the upper poll layers
326  *      take care of normal races (between the test and the event) and we don't
327  *      go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331         unsigned int mask;
332         struct sock *sk = sock->sk;
333         struct tcp_sock *tp = tcp_sk(sk);
334
335         poll_wait(file, sk->sk_sleep, wait);
336         if (sk->sk_state == TCP_LISTEN)
337                 return tcp_listen_poll(sk, wait);
338
339         /* Socket is not locked. We are protected from async events
340            by poll logic and correct handling of state changes
341            made by other threads is impossible in any case.
342          */
343
344         mask = 0;
345         if (sk->sk_err)
346                 mask = POLLERR;
347
348         /*
349          * POLLHUP is certainly not done right. But poll() doesn't
350          * have a notion of HUP in just one direction, and for a
351          * socket the read side is more interesting.
352          *
353          * Some poll() documentation says that POLLHUP is incompatible
354          * with the POLLOUT/POLLWR flags, so somebody should check all
355          * of this. But be careful: it tends to be safer to return too many
356          * bits than too few, and you can easily break real applications
357          * if you don't tell them that something has hung up!
358          *
359          * Check-me.
360          *
361          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362          * our fs/select.c). It means that after we received EOF,
363          * poll always returns immediately, making it impossible to poll() for write()
364          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365          * if and only if shutdown has been made in both directions.
366          * Actually, it is interesting to look how Solaris and DUX
367          * solve this dilemma. I would prefer, if POLLHUP were maskable,
368          * then we could set it on SND_SHUTDOWN. BTW examples given
369          * in Stevens' books assume exactly this behaviour, it explains
370          * why POLLHUP is incompatible with POLLOUT.    --ANK
371          *
372          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373          * blocking on fresh not-connected or disconnected socket. --ANK
374          */
375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376                 mask |= POLLHUP;
377         if (sk->sk_shutdown & RCV_SHUTDOWN)
378                 mask |= POLLIN | POLLRDNORM;
379
380         /* Connected? */
381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382         /* Potential race condition. If the read of tp below is
383          * reordered above the read of sk->sk_state, we can be illegally
384          * awakened in SYN_* states. */
385                 if ((tp->rcv_nxt != tp->copied_seq) &&
386                     (tp->urg_seq != tp->copied_seq ||
387                      tp->rcv_nxt != tp->copied_seq + 1 ||
388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389                         mask |= POLLIN | POLLRDNORM;
390
391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393                                 mask |= POLLOUT | POLLWRNORM;
394                         } else {  /* send SIGIO later */
395                                 set_bit(SOCK_ASYNC_NOSPACE,
396                                         &sk->sk_socket->flags);
397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399                                 /* Race breaker. If space is freed after
400                                  * wspace test but before the flags are set,
401                                  * IO signal will be lost.
402                                  */
403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404                                         mask |= POLLOUT | POLLWRNORM;
405                         }
406                 }
407
408                 if (tp->urg_data & TCP_URG_VALID)
409                         mask |= POLLPRI;
410         }
411         return mask;
412 }
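
/*
 * Userspace view (an illustrative sketch only; tcp_fd, timeout_ms and
 * handle_hangup() are placeholders for the caller's own descriptor,
 * timeout and handler): the mask computed above is what a poll(2) or
 * select(2) caller eventually sees, e.g.
 *
 *	struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, timeout_ms) > 0 && (pfd.revents & POLLHUP))
 *		handle_hangup();
 *
 * POLLHUP here means both directions are shut down or the socket is in
 * TCP_CLOSE, exactly as set at the top of the function.
 */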
413
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416         struct tcp_sock *tp = tcp_sk(sk);
417         int answ;
418
419         switch (cmd) {
420         case SIOCINQ:
421                 if (sk->sk_state == TCP_LISTEN)
422                         return -EINVAL;
423
424                 lock_sock(sk);
425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426                         answ = 0;
427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
428                          !tp->urg_data ||
429                          before(tp->urg_seq, tp->copied_seq) ||
430                          !before(tp->urg_seq, tp->rcv_nxt)) {
431                         answ = tp->rcv_nxt - tp->copied_seq;
432
433                         /* Subtract 1, if FIN is in queue. */
434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435                                 answ -= ((struct sk_buff *)
436                                          sk->sk_receive_queue.prev)->h.th->fin;
437                 } else
438                         answ = tp->urg_seq - tp->copied_seq;
439                 release_sock(sk);
440                 break;
441         case SIOCATMARK:
442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443                 break;
444         case SIOCOUTQ:
445                 if (sk->sk_state == TCP_LISTEN)
446                         return -EINVAL;
447
448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449                         answ = 0;
450                 else
451                         answ = tp->write_seq - tp->snd_una;
452                 break;
453         default:
454                 return -ENOIOCTLCMD;
455         };
456
457         return put_user(answ, (int __user *)arg);
458 }
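
/*
 * Userspace view (an illustrative sketch only; tcp_fd is a placeholder
 * for a connected TCP socket descriptor):
 *
 *	int unread, unsent;
 *
 *	ioctl(tcp_fd, SIOCINQ, &unread);
 *	ioctl(tcp_fd, SIOCOUTQ, &unsent);
 *
 * SIOCINQ reports bytes received but not yet read by the application,
 * and SIOCOUTQ reports write_seq - snd_una, i.e. bytes written but not
 * yet acknowledged, as computed above.
 */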
459
460
461 int tcp_listen_start(struct sock *sk)
462 {
463         struct inet_sock *inet = inet_sk(sk);
464         struct tcp_sock *tp = tcp_sk(sk);
465         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467         if (rc != 0)
468                 return rc;
469
470         sk->sk_max_ack_backlog = 0;
471         sk->sk_ack_backlog = 0;
472         tcp_delack_init(tp);
473
474         /* There is a race window here: we announce ourselves listening,
475          * but this transition is still not validated by get_port().
476          * It is OK, because this socket enters the hash table only
477          * after validation is complete.
478          */
479         sk->sk_state = TCP_LISTEN;
480         if (!sk->sk_prot->get_port(sk, inet->num)) {
481                 inet->sport = htons(inet->num);
482
483                 sk_dst_reset(sk);
484                 sk->sk_prot->hash(sk);
485
486                 return 0;
487         }
488
489         sk->sk_state = TCP_CLOSE;
490         reqsk_queue_destroy(&tp->accept_queue);
491         return -EADDRINUSE;
492 }
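
/*
 * Note: this is the protocol-level half of listen(2).  sk_max_ack_backlog
 * is cleared here; the af_inet caller is expected to install the real
 * backlog once this returns successfully.
 */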
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498
499 static void tcp_listen_stop (struct sock *sk)
500 {
501         struct tcp_sock *tp = tcp_sk(sk);
502         struct listen_sock *lopt;
503         struct request_sock *acc_req;
504         struct request_sock *req;
505         int i;
506
507         tcp_delete_keepalive_timer(sk);
508
509         /* make all the listen_opt local to us */
510         lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513         if (lopt->qlen) {
514                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515                         while ((req = lopt->syn_table[i]) != NULL) {
516                                 lopt->syn_table[i] = req->dl_next;
517                                 lopt->qlen--;
518                                 reqsk_free(req);
519
520                 /* Following the specs, it would be better either to send a FIN
521                  * (and enter FIN-WAIT-1; that is a normal close)
522                  * or to send an active reset (abort).
523                  * Certainly, it is pretty dangerous during a synflood, but that is
524                  * a bad justification for our negligence 8)
525                  * To be honest, we are not able to implement either
526                  * of the variants now.                 --ANK
527                  */
528                         }
529                 }
530         }
531         BUG_TRAP(!lopt->qlen);
532
533         kfree(lopt);
534
535         while ((req = acc_req) != NULL) {
536                 struct sock *child = req->sk;
537
538                 acc_req = req->dl_next;
539
540                 local_bh_disable();
541                 bh_lock_sock(child);
542                 BUG_TRAP(!sock_owned_by_user(child));
543                 sock_hold(child);
544
545                 tcp_disconnect(child, O_NONBLOCK);
546
547                 sock_orphan(child);
548
549                 atomic_inc(&tcp_orphan_count);
550
551                 tcp_destroy_sock(child);
552
553                 bh_unlock_sock(child);
554                 local_bh_enable();
555                 sock_put(child);
556
557                 sk_acceptq_removed(sk);
558                 __reqsk_free(req);
559         }
560         BUG_TRAP(!sk->sk_ack_backlog);
561 }
562
563 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564 {
565         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566         tp->pushed_seq = tp->write_seq;
567 }
568
569 static inline int forced_push(struct tcp_sock *tp)
570 {
571         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572 }
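
/*
 * Note on the two helpers above: tcp_mark_push() tags the skb with PSH
 * and records the sequence number up to which data has been pushed,
 * while forced_push() requests another push once more than half of the
 * largest window ever advertised by the peer has been queued since that
 * point.
 */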
573
574 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575                               struct sk_buff *skb)
576 {
577         skb->csum = 0;
578         TCP_SKB_CB(skb)->seq = tp->write_seq;
579         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581         TCP_SKB_CB(skb)->sacked = 0;
582         skb_header_release(skb);
583         __skb_queue_tail(&sk->sk_write_queue, skb);
584         sk_charge_skb(sk, skb);
585         if (!sk->sk_send_head)
586                 sk->sk_send_head = skb;
587         if (tp->nonagle & TCP_NAGLE_PUSH)
588                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
589 }
590
591 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592                                 struct sk_buff *skb)
593 {
594         if (flags & MSG_OOB) {
595                 tp->urg_mode = 1;
596                 tp->snd_up = tp->write_seq;
597                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598         }
599 }
600
601 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602                             int mss_now, int nonagle)
603 {
604         if (sk->sk_send_head) {
605                 struct sk_buff *skb = sk->sk_write_queue.prev;
606                 if (!(flags & MSG_MORE) || forced_push(tp))
607                         tcp_mark_push(tp, skb);
608                 tcp_mark_urg(tp, flags, skb);
609                 __tcp_push_pending_frames(sk, tp, mss_now,
610                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611         }
612 }
613
614 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615                          size_t psize, int flags)
616 {
617         struct tcp_sock *tp = tcp_sk(sk);
618         int mss_now, size_goal;
619         int err;
620         ssize_t copied;
621         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623         /* Wait for a connection to finish. */
624         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626                         goto out_err;
627
628         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631         size_goal = tp->xmit_size_goal;
632         copied = 0;
633
634         err = -EPIPE;
635         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
636                 goto do_error;
637
638         while (psize > 0) {
639                 struct sk_buff *skb = sk->sk_write_queue.prev;
640                 struct page *page = pages[poffset / PAGE_SIZE];
641                 int copy, i, can_coalesce;
642                 int offset = poffset % PAGE_SIZE;
643                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
644
645                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
646 new_segment:
647                         if (!sk_stream_memory_free(sk))
648                                 goto wait_for_sndbuf;
649
650                         skb = sk_stream_alloc_pskb(sk, 0, 0,
651                                                    sk->sk_allocation);
652                         if (!skb)
653                                 goto wait_for_memory;
654
655                         skb_entail(sk, tp, skb);
656                         copy = size_goal;
657                 }
658
659                 if (copy > size)
660                         copy = size;
661
662                 i = skb_shinfo(skb)->nr_frags;
663                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
664                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
665                         tcp_mark_push(tp, skb);
666                         goto new_segment;
667                 }
668                 if (sk->sk_forward_alloc < copy &&
669                     !sk_stream_mem_schedule(sk, copy, 0))
670                         goto wait_for_memory;
671                 
672                 if (can_coalesce) {
673                         skb_shinfo(skb)->frags[i - 1].size += copy;
674                 } else {
675                         get_page(page);
676                         skb_fill_page_desc(skb, i, page, offset, copy);
677                 }
678
679                 skb->len += copy;
680                 skb->data_len += copy;
681                 skb->truesize += copy;
682                 sk->sk_wmem_queued += copy;
683                 sk->sk_forward_alloc -= copy;
684                 skb->ip_summed = CHECKSUM_HW;
685                 tp->write_seq += copy;
686                 TCP_SKB_CB(skb)->end_seq += copy;
687                 skb_shinfo(skb)->tso_segs = 0;
688
689                 if (!copied)
690                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
691
692                 copied += copy;
693                 poffset += copy;
694                 if (!(psize -= copy))
695                         goto out;
696
697                 if (skb->len < mss_now || (flags & MSG_OOB))
698                         continue;
699
700                 if (forced_push(tp)) {
701                         tcp_mark_push(tp, skb);
702                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
703                 } else if (skb == sk->sk_send_head)
704                         tcp_push_one(sk, mss_now);
705                 continue;
706
707 wait_for_sndbuf:
708                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
709 wait_for_memory:
710                 if (copied)
711                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
712
713                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
714                         goto do_error;
715
716                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717                 size_goal = tp->xmit_size_goal;
718         }
719
720 out:
721         if (copied)
722                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
723         return copied;
724
725 do_error:
726         if (copied)
727                 goto out;
728 out_err:
729         return sk_stream_error(sk, flags, err);
730 }
731
732 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
733                      size_t size, int flags)
734 {
735         ssize_t res;
736         struct sock *sk = sock->sk;
737
738 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
739
740         if (!(sk->sk_route_caps & NETIF_F_SG) ||
741             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
742                 return sock_no_sendpage(sock, page, offset, size, flags);
743
744 #undef TCP_ZC_CSUM_FLAGS
745
746         lock_sock(sk);
747         TCP_CHECK_TIMER(sk);
748         res = do_tcp_sendpages(sk, &page, offset, size, flags);
749         TCP_CHECK_TIMER(sk);
750         release_sock(sk);
751         return res;
752 }
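
/*
 * Illustrative only: this is typically reached via sendfile(2) on a TCP
 * socket when the route supports scatter-gather and checksum offload;
 * otherwise we fall back to sock_no_sendpage() above.  Userspace sketch
 * (tcp_fd, file_fd and count are placeholders):
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(tcp_fd, file_fd, &off, count);
 */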
753
754 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
755 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
756
757 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
758 {
759         int tmp = tp->mss_cache;
760
761         if (sk->sk_route_caps & NETIF_F_SG) {
762                 if (sk->sk_route_caps & NETIF_F_TSO)
763                         tmp = 0;
764                 else {
765                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
766
767                         if (tmp >= pgbreak &&
768                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
769                                 tmp = pgbreak;
770                 }
771         }
772
773         return tmp;
774 }
775
776 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
777                 size_t size)
778 {
779         struct iovec *iov;
780         struct tcp_sock *tp = tcp_sk(sk);
781         struct sk_buff *skb;
782         int iovlen, flags;
783         int mss_now, size_goal;
784         int err, copied;
785         long timeo;
786
787         lock_sock(sk);
788         TCP_CHECK_TIMER(sk);
789
790         flags = msg->msg_flags;
791         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
792
793         /* Wait for a connection to finish. */
794         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
795                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
796                         goto out_err;
797
798         /* This should be in poll */
799         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
800
801         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802         size_goal = tp->xmit_size_goal;
803
804         /* Ok commence sending. */
805         iovlen = msg->msg_iovlen;
806         iov = msg->msg_iov;
807         copied = 0;
808
809         err = -EPIPE;
810         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
811                 goto do_error;
812
813         while (--iovlen >= 0) {
814                 int seglen = iov->iov_len;
815                 unsigned char __user *from = iov->iov_base;
816
817                 iov++;
818
819                 while (seglen > 0) {
820                         int copy;
821
822                         skb = sk->sk_write_queue.prev;
823
824                         if (!sk->sk_send_head ||
825                             (copy = size_goal - skb->len) <= 0) {
826
827 new_segment:
828                                 /* Allocate a new segment. If the interface is SG,
829                                  * allocate an skb that fits into a single page.
830                                  */
831                                 if (!sk_stream_memory_free(sk))
832                                         goto wait_for_sndbuf;
833
834                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
835                                                            0, sk->sk_allocation);
836                                 if (!skb)
837                                         goto wait_for_memory;
838
839                                 /*
840                                  * Check whether we can use HW checksum.
841                                  */
842                                 if (sk->sk_route_caps &
843                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
844                                      NETIF_F_HW_CSUM))
845                                         skb->ip_summed = CHECKSUM_HW;
846
847                                 skb_entail(sk, tp, skb);
848                                 copy = size_goal;
849                         }
850
851                         /* Try to append data to the end of skb. */
852                         if (copy > seglen)
853                                 copy = seglen;
854
855                         /* Where to copy to? */
856                         if (skb_tailroom(skb) > 0) {
857                                 /* We have some space in skb head. Superb! */
858                                 if (copy > skb_tailroom(skb))
859                                         copy = skb_tailroom(skb);
860                                 if ((err = skb_add_data(skb, from, copy)) != 0)
861                                         goto do_fault;
862                         } else {
863                                 int merge = 0;
864                                 int i = skb_shinfo(skb)->nr_frags;
865                                 struct page *page = TCP_PAGE(sk);
866                                 int off = TCP_OFF(sk);
867
868                                 if (skb_can_coalesce(skb, i, page, off) &&
869                                     off != PAGE_SIZE) {
870                                         /* We can extend the last page
871                                          * fragment. */
872                                         merge = 1;
873                                 } else if (i == MAX_SKB_FRAGS ||
874                                            (!i &&
875                                            !(sk->sk_route_caps & NETIF_F_SG))) {
876                                         /* Need to add new fragment and cannot
877                                          * do this because interface is non-SG,
878                                          * or because all the page slots are
879                                          * busy. */
880                                         tcp_mark_push(tp, skb);
881                                         goto new_segment;
882                                 } else if (page) {
883                                         if (off == PAGE_SIZE) {
884                                                 put_page(page);
885                                                 TCP_PAGE(sk) = page = NULL;
886                                         }
887                                 }
888
889                                 if (!page) {
890                                         /* Allocate new cache page. */
891                                         if (!(page = sk_stream_alloc_page(sk)))
892                                                 goto wait_for_memory;
893                                         off = 0;
894                                 }
895
896                                 if (copy > PAGE_SIZE - off)
897                                         copy = PAGE_SIZE - off;
898
899                                 /* Time to copy data. We are close to
900                                  * the end! */
901                                 err = skb_copy_to_page(sk, from, skb, page,
902                                                        off, copy);
903                                 if (err) {
904                                         /* If this page was new, give it to the
905                                          * socket so it does not get leaked.
906                                          */
907                                         if (!TCP_PAGE(sk)) {
908                                                 TCP_PAGE(sk) = page;
909                                                 TCP_OFF(sk) = 0;
910                                         }
911                                         goto do_error;
912                                 }
913
914                                 /* Update the skb. */
915                                 if (merge) {
916                                         skb_shinfo(skb)->frags[i - 1].size +=
917                                                                         copy;
918                                 } else {
919                                         skb_fill_page_desc(skb, i, page, off, copy);
920                                         if (TCP_PAGE(sk)) {
921                                                 get_page(page);
922                                         } else if (off + copy < PAGE_SIZE) {
923                                                 get_page(page);
924                                                 TCP_PAGE(sk) = page;
925                                         }
926                                 }
927
928                                 TCP_OFF(sk) = off + copy;
929                         }
930
931                         if (!copied)
932                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
933
934                         tp->write_seq += copy;
935                         TCP_SKB_CB(skb)->end_seq += copy;
936                         skb_shinfo(skb)->tso_segs = 0;
937
938                         from += copy;
939                         copied += copy;
940                         if ((seglen -= copy) == 0 && iovlen == 0)
941                                 goto out;
942
943                         if (skb->len < mss_now || (flags & MSG_OOB))
944                                 continue;
945
946                         if (forced_push(tp)) {
947                                 tcp_mark_push(tp, skb);
948                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
949                         } else if (skb == sk->sk_send_head)
950                                 tcp_push_one(sk, mss_now);
951                         continue;
952
953 wait_for_sndbuf:
954                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
955 wait_for_memory:
956                         if (copied)
957                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
958
959                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
960                                 goto do_error;
961
962                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963                         size_goal = tp->xmit_size_goal;
964                 }
965         }
966
967 out:
968         if (copied)
969                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
970         TCP_CHECK_TIMER(sk);
971         release_sock(sk);
972         return copied;
973
974 do_fault:
975         if (!skb->len) {
976                 if (sk->sk_send_head == skb)
977                         sk->sk_send_head = NULL;
978                 __skb_unlink(skb, skb->list);
979                 sk_stream_free_skb(sk, skb);
980         }
981
982 do_error:
983         if (copied)
984                 goto out;
985 out_err:
986         err = sk_stream_error(sk, flags, err);
987         TCP_CHECK_TIMER(sk);
988         release_sock(sk);
989         return err;
990 }
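
/*
 * Illustrative only: the MSG_MORE handling in the send path above acts
 * like a per-call TCP_CORK, letting a caller batch small writes into
 * fuller segments.  Userspace sketch (tcp_fd, hdr, body and the length
 * variables are placeholders):
 *
 *	send(tcp_fd, hdr, hdr_len, MSG_MORE);
 *	send(tcp_fd, body, body_len, 0);
 *
 * The first call is held back (corked for that call only); the second
 * pushes the combined data out.
 */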
991
992 /*
993  *      Handle reading urgent data. BSD has very simple semantics for
994  *      this, no blocking and very strange errors 8)
995  */
996
997 static int tcp_recv_urg(struct sock *sk, long timeo,
998                         struct msghdr *msg, int len, int flags,
999                         int *addr_len)
1000 {
1001         struct tcp_sock *tp = tcp_sk(sk);
1002
1003         /* No URG data to read. */
1004         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1005             tp->urg_data == TCP_URG_READ)
1006                 return -EINVAL; /* Yes this is right ! */
1007
1008         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1009                 return -ENOTCONN;
1010
1011         if (tp->urg_data & TCP_URG_VALID) {
1012                 int err = 0;
1013                 char c = tp->urg_data;
1014
1015                 if (!(flags & MSG_PEEK))
1016                         tp->urg_data = TCP_URG_READ;
1017
1018                 /* Read urgent data. */
1019                 msg->msg_flags |= MSG_OOB;
1020
1021                 if (len > 0) {
1022                         if (!(flags & MSG_TRUNC))
1023                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1024                         len = 1;
1025                 } else
1026                         msg->msg_flags |= MSG_TRUNC;
1027
1028                 return err ? -EFAULT : len;
1029         }
1030
1031         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1032                 return 0;
1033
1034         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1035          * the available implementations agree in this case:
1036          * this call should never block, independent of the
1037          * blocking state of the socket.
1038          * Mike <pall@rz.uni-karlsruhe.de>
1039          */
1040         return -EAGAIN;
1041 }
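
/*
 * Userspace view (an illustrative sketch only; tcp_fd and
 * process_oob_byte() are placeholders):
 *
 *	char oob;
 *
 *	if (recv(tcp_fd, &oob, 1, MSG_OOB) == 1)
 *		process_oob_byte(oob);
 *
 * With SO_OOBINLINE enabled, or once the urgent byte has already been
 * consumed, the call fails with EINVAL, as handled at the top of the
 * function; it never blocks, per the BSD semantics noted above.
 */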
1042
1043 /* Clean up the receive buffer for full frames taken by the user,
1044  * then send an ACK if necessary.  COPIED is the number of bytes
1045  * tcp_recvmsg has given to the user so far; it speeds up the
1046  * calculation of whether or not we must ACK for the sake of
1047  * a window update.
1048  */
1049 static void cleanup_rbuf(struct sock *sk, int copied)
1050 {
1051         struct tcp_sock *tp = tcp_sk(sk);
1052         int time_to_ack = 0;
1053
1054 #if TCP_DEBUG
1055         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1056
1057         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1058 #endif
1059
1060         if (tcp_ack_scheduled(tp)) {
1061                    /* Delayed ACKs frequently hit locked sockets during bulk
1062                     * receive. */
1063                 if (tp->ack.blocked ||
1064                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1065                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1066                     /*
1067                      * If this read emptied the read buffer, we send an ACK if
1068                      * the connection is not bidirectional, the user drained
1069                      * the receive buffer and there was a small segment
1070                      * in the queue.
1071                      */
1072                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1073                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1074                         time_to_ack = 1;
1075         }
1076
1077         /* We send an ACK if we can now advertise a non-zero window
1078          * which has been raised "significantly".
1079          *
1080          * Even if the window is raised up to infinity, do not send a window-open ACK
1081          * in states where we will not receive more data. It is useless.
1082          */
1083         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1084                 __u32 rcv_window_now = tcp_receive_window(tp);
1085
1086                 /* Optimize, __tcp_select_window() is not cheap. */
1087                 if (2*rcv_window_now <= tp->window_clamp) {
1088                         __u32 new_window = __tcp_select_window(sk);
1089
1090                         /* Send ACK now, if this read freed lots of space
1091                          * in our buffer. new_window is the window we would advertise now;
1092                          * we can advertise it if it is not less than the current one.
1093                          * "Lots" means "at least twice" here.
1094                          */
1095                         if (new_window && new_window >= 2 * rcv_window_now)
1096                                 time_to_ack = 1;
1097                 }
1098         }
1099         if (time_to_ack)
1100                 tcp_send_ack(sk);
1101 }
1102
1103 static void tcp_prequeue_process(struct sock *sk)
1104 {
1105         struct sk_buff *skb;
1106         struct tcp_sock *tp = tcp_sk(sk);
1107
1108         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1109
1110         /* RX process wants to run with disabled BHs, though it is not
1111          * necessary */
1112         local_bh_disable();
1113         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1114                 sk->sk_backlog_rcv(sk, skb);
1115         local_bh_enable();
1116
1117         /* Clear memory counter. */
1118         tp->ucopy.memory = 0;
1119 }
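
/*
 * Note: the prequeue drained above is filled by the softirq receive path
 * while a reader is waiting in tcp_recvmsg(); draining it here runs the
 * normal receive code (sk_backlog_rcv) in process context instead.
 */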
1120
1121 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1122 {
1123         struct sk_buff *skb;
1124         u32 offset;
1125
1126         skb_queue_walk(&sk->sk_receive_queue, skb) {
1127                 offset = seq - TCP_SKB_CB(skb)->seq;
1128                 if (skb->h.th->syn)
1129                         offset--;
1130                 if (offset < skb->len || skb->h.th->fin) {
1131                         *off = offset;
1132                         return skb;
1133                 }
1134         }
1135         return NULL;
1136 }
1137
1138 /*
1139  * This routine provides an alternative to tcp_recvmsg() for routines
1140  * that would like to handle copying from skbuffs directly in 'sendfile'
1141  * fashion.
1142  * Note:
1143  *      - It is assumed that the socket was locked by the caller.
1144  *      - The routine does not block.
1145  *      - At present, there is no support for reading OOB data
1146  *        or for 'peeking' the socket using this routine
1147  *        (although both would be easy to implement).
1148  */
1149 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1150                   sk_read_actor_t recv_actor)
1151 {
1152         struct sk_buff *skb;
1153         struct tcp_sock *tp = tcp_sk(sk);
1154         u32 seq = tp->copied_seq;
1155         u32 offset;
1156         int copied = 0;
1157
1158         if (sk->sk_state == TCP_LISTEN)
1159                 return -ENOTCONN;
1160         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1161                 if (offset < skb->len) {
1162                         size_t used, len;
1163
1164                         len = skb->len - offset;
1165                         /* Stop reading if we hit a patch of urgent data */
1166                         if (tp->urg_data) {
1167                                 u32 urg_offset = tp->urg_seq - seq;
1168                                 if (urg_offset < len)
1169                                         len = urg_offset;
1170                                 if (!len)
1171                                         break;
1172                         }
1173                         used = recv_actor(desc, skb, offset, len);
1174                         if (used <= len) {
1175                                 seq += used;
1176                                 copied += used;
1177                                 offset += used;
1178                         }
1179                         if (offset != skb->len)
1180                                 break;
1181                 }
1182                 if (skb->h.th->fin) {
1183                         sk_eat_skb(sk, skb);
1184                         ++seq;
1185                         break;
1186                 }
1187                 sk_eat_skb(sk, skb);
1188                 if (!desc->count)
1189                         break;
1190         }
1191         tp->copied_seq = seq;
1192
1193         tcp_rcv_space_adjust(sk);
1194
1195         /* Clean up data we have read: This will do ACK frames. */
1196         if (copied)
1197                 cleanup_rbuf(sk, copied);
1198         return copied;
1199 }
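
/*
 * Note on the contract above: recv_actor() returns how many bytes it
 * consumed.  If it takes less than it was offered, or desc->count drops
 * to zero, the walk stops; tp->copied_seq only advances past data that
 * was actually consumed.  Fully consumed skbs (including a FIN) are
 * freed with sk_eat_skb() as we go.
 */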
1200
1201 /*
1202  *      This routine copies from a sock struct into the user buffer.
1203  *
1204  *      Technical note: in 2.3 we work on _locked_ socket, so that
1205  *      tricks with *seq access order and skb->users are not required.
1206  *      Probably, code can be easily improved even more.
1207  */
1208
1209 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1210                 size_t len, int nonblock, int flags, int *addr_len)
1211 {
1212         struct tcp_sock *tp = tcp_sk(sk);
1213         int copied = 0;
1214         u32 peek_seq;
1215         u32 *seq;
1216         unsigned long used;
1217         int err;
1218         int target;             /* Read at least this many bytes */
1219         long timeo;
1220         struct task_struct *user_recv = NULL;
1221
1222         lock_sock(sk);
1223
1224         TCP_CHECK_TIMER(sk);
1225
1226         err = -ENOTCONN;
1227         if (sk->sk_state == TCP_LISTEN)
1228                 goto out;
1229
1230         timeo = sock_rcvtimeo(sk, nonblock);
1231
1232         /* Urgent data needs to be handled specially. */
1233         if (flags & MSG_OOB)
1234                 goto recv_urg;
1235
1236         seq = &tp->copied_seq;
1237         if (flags & MSG_PEEK) {
1238                 peek_seq = tp->copied_seq;
1239                 seq = &peek_seq;
1240         }
1241
1242         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1243
1244         do {
1245                 struct sk_buff *skb;
1246                 u32 offset;
1247
1248                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1249                 if (tp->urg_data && tp->urg_seq == *seq) {
1250                         if (copied)
1251                                 break;
1252                         if (signal_pending(current)) {
1253                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1254                                 break;
1255                         }
1256                 }
1257
1258                 /* Next get a buffer. */
1259
1260                 skb = skb_peek(&sk->sk_receive_queue);
1261                 do {
1262                         if (!skb)
1263                                 break;
1264
1265                         /* Now that we have two receive queues this
1266                          * shouldn't happen.
1267                          */
1268                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1269                                 printk(KERN_INFO "recvmsg bug: copied %X "
1270                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1271                                 break;
1272                         }
1273                         offset = *seq - TCP_SKB_CB(skb)->seq;
1274                         if (skb->h.th->syn)
1275                                 offset--;
1276                         if (offset < skb->len)
1277                                 goto found_ok_skb;
1278                         if (skb->h.th->fin)
1279                                 goto found_fin_ok;
1280                         BUG_TRAP(flags & MSG_PEEK);
1281                         skb = skb->next;
1282                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1283
1284                 /* Well, if we have backlog, try to process it now. */
1285
1286                 if (copied >= target && !sk->sk_backlog.tail)
1287                         break;
1288
1289                 if (copied) {
1290                         if (sk->sk_err ||
1291                             sk->sk_state == TCP_CLOSE ||
1292                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1293                             !timeo ||
1294                             signal_pending(current) ||
1295                             (flags & MSG_PEEK))
1296                                 break;
1297                 } else {
1298                         if (sock_flag(sk, SOCK_DONE))
1299                                 break;
1300
1301                         if (sk->sk_err) {
1302                                 copied = sock_error(sk);
1303                                 break;
1304                         }
1305
1306                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1307                                 break;
1308
1309                         if (sk->sk_state == TCP_CLOSE) {
1310                                 if (!sock_flag(sk, SOCK_DONE)) {
1311                                         /* This occurs when the user tries to
1312                                          * read from a never-connected socket.
1313                                          */
1314                                         copied = -ENOTCONN;
1315                                         break;
1316                                 }
1317                                 break;
1318                         }
1319
1320                         if (!timeo) {
1321                                 copied = -EAGAIN;
1322                                 break;
1323                         }
1324
1325                         if (signal_pending(current)) {
1326                                 copied = sock_intr_errno(timeo);
1327                                 break;
1328                         }
1329                 }
1330
1331                 cleanup_rbuf(sk, copied);
1332
1333                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1334                         /* Install new reader */
1335                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1336                                 user_recv = current;
1337                                 tp->ucopy.task = user_recv;
1338                                 tp->ucopy.iov = msg->msg_iov;
1339                         }
1340
1341                         tp->ucopy.len = len;
1342
1343                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1344                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1345
1346                         /* Ugly... If the prequeue is not empty, we have to
1347                          * process it before releasing the socket, otherwise
1348                          * ordering will be broken on the second iteration.
1349                          * A more elegant solution is required!
1350                          *
1351                          * Look: we have the following (pseudo)queues:
1352                          *
1353                          * 1. packets in flight
1354                          * 2. backlog
1355                          * 3. prequeue
1356                          * 4. receive_queue
1357                          *
1358                          * Each queue can be processed only if the next ones
1359                          * are empty. At this point we have empty receive_queue.
1360                          * But the prequeue _can_ be non-empty after the 2nd
1361                          * iteration, when we jumped to the start of the loop
1362                          * because backlog processing added something to the
1363                          * receive_queue.  We cannot release_sock(), because the
1364                          * backlog contains packets that arrived _after_ the
1365                          * prequeued ones.
1366                          *
1367                          * In short, the algorithm is clear --- process all the
1368                          * queues in order.  We could do this more directly, by
1369                          * requeueing packets from the backlog to the prequeue
1370                          * if it is not empty, but that eats cycles, unfortunately.
1371                          */
1372                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1373                                 goto do_prequeue;
1374
1375                         /* __ Set realtime policy in scheduler __ */
1376                 }
1377
1378                 if (copied >= target) {
1379                         /* Do not sleep, just process backlog. */
1380                         release_sock(sk);
1381                         lock_sock(sk);
1382                 } else
1383                         sk_wait_data(sk, &timeo);
1384
1385                 if (user_recv) {
1386                         int chunk;
1387
1388                         /* __ Restore normal policy in scheduler __ */
1389
1390                         if ((chunk = len - tp->ucopy.len) != 0) {
1391                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1392                                 len -= chunk;
1393                                 copied += chunk;
1394                         }
1395
1396                         if (tp->rcv_nxt == tp->copied_seq &&
1397                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1398 do_prequeue:
1399                                 tcp_prequeue_process(sk);
1400
1401                                 if ((chunk = len - tp->ucopy.len) != 0) {
1402                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1403                                         len -= chunk;
1404                                         copied += chunk;
1405                                 }
1406                         }
1407                 }
1408                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1409                         if (net_ratelimit())
1410                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1411                                        current->comm, current->pid);
1412                         peek_seq = tp->copied_seq;
1413                 }
1414                 continue;
1415
1416         found_ok_skb:
1417                 /* Ok so how much can we use? */
1418                 used = skb->len - offset;
1419                 if (len < used)
1420                         used = len;
1421
1422                 /* Do we have urgent data here? */
1423                 if (tp->urg_data) {
1424                         u32 urg_offset = tp->urg_seq - *seq;
1425                         if (urg_offset < used) {
1426                                 if (!urg_offset) {
1427                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1428                                                 ++*seq;
1429                                                 offset++;
1430                                                 used--;
1431                                                 if (!used)
1432                                                         goto skip_copy;
1433                                         }
1434                                 } else
1435                                         used = urg_offset;
1436                         }
1437                 }
1438
1439                 if (!(flags & MSG_TRUNC)) {
1440                         err = skb_copy_datagram_iovec(skb, offset,
1441                                                       msg->msg_iov, used);
1442                         if (err) {
1443                                 /* Exception. Bailout! */
1444                                 if (!copied)
1445                                         copied = -EFAULT;
1446                                 break;
1447                         }
1448                 }
1449
1450                 *seq += used;
1451                 copied += used;
1452                 len -= used;
1453
1454                 tcp_rcv_space_adjust(sk);
1455
1456 skip_copy:
1457                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1458                         tp->urg_data = 0;
1459                         tcp_fast_path_check(sk, tp);
1460                 }
1461                 if (used + offset < skb->len)
1462                         continue;
1463
1464                 if (skb->h.th->fin)
1465                         goto found_fin_ok;
1466                 if (!(flags & MSG_PEEK))
1467                         sk_eat_skb(sk, skb);
1468                 continue;
1469
1470         found_fin_ok:
1471                 /* Process the FIN. */
1472                 ++*seq;
1473                 if (!(flags & MSG_PEEK))
1474                         sk_eat_skb(sk, skb);
1475                 break;
1476         } while (len > 0);
1477
1478         if (user_recv) {
1479                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1480                         int chunk;
1481
1482                         tp->ucopy.len = copied > 0 ? len : 0;
1483
1484                         tcp_prequeue_process(sk);
1485
1486                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1487                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1488                                 len -= chunk;
1489                                 copied += chunk;
1490                         }
1491                 }
1492
1493                 tp->ucopy.task = NULL;
1494                 tp->ucopy.len = 0;
1495         }
1496
1497         /* According to UNIX98, msg_name/msg_namelen are ignored
1498          * on a connected socket. I was just happy when I found this 8) --ANK
1499          */
1500
1501         /* Clean up data we have read: This will do ACK frames. */
1502         cleanup_rbuf(sk, copied);
1503
1504         TCP_CHECK_TIMER(sk);
1505         release_sock(sk);
1506         return copied;
1507
1508 out:
1509         TCP_CHECK_TIMER(sk);
1510         release_sock(sk);
1511         return err;
1512
1513 recv_urg:
1514         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1515         goto out;
1516 }
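
/*
 * An illustrative user-space view of the MSG_PEEK handling above (not
 * kernel code; fd and buf are placeholders): with MSG_PEEK only the
 * local peek_seq advances, so a later normal receive sees the same
 * bytes again:
 *
 *	char buf[128];
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	data stays queued
 *	recv(fd, buf, sizeof(buf), 0);		now tp->copied_seq advances
 */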
1517
1518 /*
1519  *      State processing on a close. This implements the state shift for
1520  *      sending our FIN frame. Note that we only send a FIN for some
1521  *      states. A shutdown() may have already sent the FIN, or we may be
1522  *      closed.
1523  */
1524
1525 static unsigned char new_state[16] = {
1526   /* current state:        new state:      action:      */
1527   /* (Invalid)          */ TCP_CLOSE,
1528   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1529   /* TCP_SYN_SENT       */ TCP_CLOSE,
1530   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1531   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1532   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1533   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1534   /* TCP_CLOSE          */ TCP_CLOSE,
1535   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1536   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1537   /* TCP_LISTEN         */ TCP_CLOSE,
1538   /* TCP_CLOSING        */ TCP_CLOSING,
1539 };
1540
1541 static int tcp_close_state(struct sock *sk)
1542 {
1543         int next = (int)new_state[sk->sk_state];
1544         int ns = next & TCP_STATE_MASK;
1545
1546         tcp_set_state(sk, ns);
1547
1548         return next & TCP_ACTION_FIN;
1549 }
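
/*
 * A worked example of the table lookup above: for a socket in
 * TCP_ESTABLISHED the entry is TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN-WAIT-1 and returns non-zero,
 * telling the caller (tcp_close() or tcp_shutdown()) to transmit a FIN.
 * The TCP_FIN_WAIT2 entry carries no TCP_ACTION_FIN because our FIN has
 * already been sent.
 */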
1550
1551 /*
1552  *      Shutdown the sending side of a connection. Much like close except
1553  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
1554  */
1555
1556 void tcp_shutdown(struct sock *sk, int how)
1557 {
1558         /*      We need to grab some memory, and put together a FIN,
1559          *      and then put it into the queue to be sent.
1560          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1561          */
1562         if (!(how & SEND_SHUTDOWN))
1563                 return;
1564
1565         /* If we've already sent a FIN, or it's a closed state, skip this. */
1566         if ((1 << sk->sk_state) &
1567             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1568              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1569                 /* Clear out any half completed packets.  FIN if needed. */
1570                 if (tcp_close_state(sk))
1571                         tcp_send_fin(sk);
1572         }
1573 }
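
/*
 * An illustrative user-space counterpart of the half-close implemented
 * above (a sketch only; fd is a connected stream socket, consume() a
 * placeholder for application processing, error handling omitted):
 *
 *	shutdown(fd, SHUT_WR);			our FIN is queued
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);		the peer may keep sending
 */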
1574
1575 /*
1576  * At this point, there should be no process reference to this
1577  * socket, and thus no user references at all.  Therefore we
1578  * can assume the socket waitqueue is inactive and nobody will
1579  * try to jump onto it.
1580  */
1581 void tcp_destroy_sock(struct sock *sk)
1582 {
1583         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1584         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1585
1586         /* It cannot be in hash table! */
1587         BUG_TRAP(sk_unhashed(sk));
1588
1589         /* If it has a non-zero inet_sk(sk)->num, it must be bound. */
1590         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1591
1592         sk->sk_prot->destroy(sk);
1593
1594         sk_stream_kill_queues(sk);
1595
1596         xfrm_sk_free_policy(sk);
1597
1598 #ifdef INET_REFCNT_DEBUG
1599         if (atomic_read(&sk->sk_refcnt) != 1) {
1600                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1601                        sk, atomic_read(&sk->sk_refcnt));
1602         }
1603 #endif
1604
1605         atomic_dec(&tcp_orphan_count);
1606         sock_put(sk);
1607 }
1608
1609 void tcp_close(struct sock *sk, long timeout)
1610 {
1611         struct sk_buff *skb;
1612         int data_was_unread = 0;
1613
1614         lock_sock(sk);
1615         sk->sk_shutdown = SHUTDOWN_MASK;
1616
1617         if (sk->sk_state == TCP_LISTEN) {
1618                 tcp_set_state(sk, TCP_CLOSE);
1619
1620                 /* Special case. */
1621                 tcp_listen_stop(sk);
1622
1623                 goto adjudge_to_death;
1624         }
1625
1626         /*  We need to flush the receive buffers.  We do this only on the
1627          *  descriptor close, not protocol-sourced closes, because the
1628          *  reader process may not have drained the data yet!
1629          */
1630         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1631                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1632                           skb->h.th->fin;
1633                 data_was_unread += len;
1634                 __kfree_skb(skb);
1635         }
1636
1637         sk_stream_mem_reclaim(sk);
1638
1639         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1640          * 3.10, we send a RST here because data was lost.  To
1641          * witness the awful effects of the old behavior of always
1642          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1643          * a bulk GET in an FTP client, suspend the process, wait
1644          * for the client to advertise a zero window, then kill -9
1645          * the FTP client, wheee...  Note: timeout is always zero
1646          * in such a case.
1647          */
1648         if (data_was_unread) {
1649                 /* Unread data was tossed, zap the connection. */
1650                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1651                 tcp_set_state(sk, TCP_CLOSE);
1652                 tcp_send_active_reset(sk, GFP_KERNEL);
1653         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1654                 /* Check zero linger _after_ checking for unread data. */
1655                 sk->sk_prot->disconnect(sk, 0);
1656                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1657         } else if (tcp_close_state(sk)) {
1658                 /* We FIN if the application ate all the data before
1659                  * zapping the connection.
1660                  */
1661
1662                 /* RED-PEN. Formally speaking, we have broken TCP state
1663                  * machine. State transitions:
1664                  *
1665                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1666                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1667                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1668                  *
1669                  * are legal only when FIN has been sent (i.e. in window),
1670                  * rather than queued out of window. Purists blame.
1671                  *
1672                  * E.g. the "RFC state" is ESTABLISHED if the Linux state
1673                  * is FIN-WAIT-1 but the FIN has still not been sent.
1674                  *
1675                  * The visible deviations are that sometimes we enter the
1676                  * time-wait state when it is not really required (harmless),
1677                  * and do not send active resets when the specs require them
1678                  * (TCP_ESTABLISHED and TCP_CLOSE_WAIT, when to Linux they
1679                  * look like CLOSING or LAST_ACK).
1680                  * I have probably missed a few more small holes.
1681                  *                                              --ANK
1682                  */
1683                 tcp_send_fin(sk);
1684         }
1685
1686         sk_stream_wait_close(sk, timeout);
1687
1688 adjudge_to_death:
1689         /* It is the last release_sock in its life. It will remove backlog. */
1690         release_sock(sk);
1691
1692
1693         /* Now socket is owned by kernel and we acquire BH lock
1694            to finish close. No need to check for user refs.
1695          */
1696         local_bh_disable();
1697         bh_lock_sock(sk);
1698         BUG_TRAP(!sock_owned_by_user(sk));
1699
1700         sock_hold(sk);
1701         sock_orphan(sk);
1702
1703         /*      This is a (useful) BSD violation of the RFC. There is a
1704          *      problem with TCP as specified, in that the other end could
1705          *      keep a socket open forever with no application left at this
1706          *      end.  We use a 3 minute timeout (about the same as BSD) and
1707          *      then kill our end. If they send after that then tough - BUT:
1708          *      it is long enough that we won't repeat the old "4*rto =
1709          *      almost no time - whoops, reset" mistake.
1710          *
1711          *      Nope, it was not a mistake. It is really the desired
1712          *      behaviour, e.g. on HTTP servers, where such sockets are
1713          *      useless but consume significant resources. Let's do it with
1714          *      the special linger2 option.                     --ANK
1715          */
1716
1717         if (sk->sk_state == TCP_FIN_WAIT2) {
1718                 struct tcp_sock *tp = tcp_sk(sk);
1719                 if (tp->linger2 < 0) {
1720                         tcp_set_state(sk, TCP_CLOSE);
1721                         tcp_send_active_reset(sk, GFP_ATOMIC);
1722                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1723                 } else {
1724                         int tmo = tcp_fin_time(tp);
1725
1726                         if (tmo > TCP_TIMEWAIT_LEN) {
1727                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1728                         } else {
1729                                 atomic_inc(&tcp_orphan_count);
1730                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1731                                 goto out;
1732                         }
1733                 }
1734         }
1735         if (sk->sk_state != TCP_CLOSE) {
1736                 sk_stream_mem_reclaim(sk);
1737                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1738                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1739                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1740                         if (net_ratelimit())
1741                                 printk(KERN_INFO "TCP: too many orphaned "
1742                                        "sockets\n");
1743                         tcp_set_state(sk, TCP_CLOSE);
1744                         tcp_send_active_reset(sk, GFP_ATOMIC);
1745                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1746                 }
1747         }
1748         atomic_inc(&tcp_orphan_count);
1749
1750         if (sk->sk_state == TCP_CLOSE)
1751                 tcp_destroy_sock(sk);
1752         /* Otherwise, socket is reprieved until protocol close. */
1753
1754 out:
1755         bh_unlock_sock(sk);
1756         local_bh_enable();
1757         sock_put(sk);
1758 }
1759
1760 /* These states need RST on ABORT according to RFC793 */
1761
1762 static inline int tcp_need_reset(int state)
1763 {
1764         return (1 << state) &
1765                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1766                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1767 }
1768
1769 int tcp_disconnect(struct sock *sk, int flags)
1770 {
1771         struct inet_sock *inet = inet_sk(sk);
1772         struct tcp_sock *tp = tcp_sk(sk);
1773         int err = 0;
1774         int old_state = sk->sk_state;
1775
1776         if (old_state != TCP_CLOSE)
1777                 tcp_set_state(sk, TCP_CLOSE);
1778
1779         /* ABORT function of RFC793 */
1780         if (old_state == TCP_LISTEN) {
1781                 tcp_listen_stop(sk);
1782         } else if (tcp_need_reset(old_state) ||
1783                    (tp->snd_nxt != tp->write_seq &&
1784                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1785                 /* The last check adjusts for the discrepancy between Linux
1786                  * and RFC states.
1787                  */
1788                 tcp_send_active_reset(sk, gfp_any());
1789                 sk->sk_err = ECONNRESET;
1790         } else if (old_state == TCP_SYN_SENT)
1791                 sk->sk_err = ECONNRESET;
1792
1793         tcp_clear_xmit_timers(sk);
1794         __skb_queue_purge(&sk->sk_receive_queue);
1795         sk_stream_writequeue_purge(sk);
1796         __skb_queue_purge(&tp->out_of_order_queue);
1797
1798         inet->dport = 0;
1799
1800         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1801                 inet_reset_saddr(sk);
1802
1803         sk->sk_shutdown = 0;
1804         sock_reset_flag(sk, SOCK_DONE);
1805         tp->srtt = 0;
1806         if ((tp->write_seq += tp->max_window + 2) == 0)
1807                 tp->write_seq = 1;
1808         tp->backoff = 0;
1809         tp->snd_cwnd = 2;
1810         tp->probes_out = 0;
1811         tp->packets_out = 0;
1812         tp->snd_ssthresh = 0x7fffffff;
1813         tp->snd_cwnd_cnt = 0;
1814         tcp_set_ca_state(tp, TCP_CA_Open);
1815         tcp_clear_retrans(tp);
1816         tcp_delack_init(tp);
1817         sk->sk_send_head = NULL;
1818         tp->rx_opt.saw_tstamp = 0;
1819         tcp_sack_reset(&tp->rx_opt);
1820         __sk_dst_reset(sk);
1821
1822         BUG_TRAP(!inet->num || tp->bind_hash);
1823
1824         sk->sk_error_report(sk);
1825         return err;
1826 }
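
/*
 * For reference, one user-space path into tcp_disconnect() is connecting
 * the socket to an AF_UNSPEC address, which inet_stream_connect() turns
 * into a call to sk->sk_prot->disconnect().  A sketch (fd is assumed to
 * be an existing TCP socket):
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(fd, &sa, sizeof(sa));
 */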
1827
1828 /*
1829  *      Wait for an incoming connection, avoid race
1830  *      conditions. This must be called with the socket locked.
1831  */
1832 static int wait_for_connect(struct sock *sk, long timeo)
1833 {
1834         struct tcp_sock *tp = tcp_sk(sk);
1835         DEFINE_WAIT(wait);
1836         int err;
1837
1838         /*
1839          * True wake-one mechanism for incoming connections: only
1840          * one process gets woken up, not the 'whole herd'.
1841          * Since we do not 'race & poll' for established sockets
1842          * anymore, the common case will execute the loop only once.
1843          *
1844          * Subtle issue: "add_wait_queue_exclusive()" will be added
1845          * after any current non-exclusive waiters, and we know that
1846          * it will always _stay_ after any new non-exclusive waiters
1847          * because all non-exclusive waiters are added at the
1848          * beginning of the wait-queue. As such, it's ok to "drop"
1849          * our exclusiveness temporarily when we get woken up without
1850          * having to remove and re-insert us on the wait queue.
1851          */
1852         for (;;) {
1853                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1854                                           TASK_INTERRUPTIBLE);
1855                 release_sock(sk);
1856                 if (reqsk_queue_empty(&tp->accept_queue))
1857                         timeo = schedule_timeout(timeo);
1858                 lock_sock(sk);
1859                 err = 0;
1860                 if (!reqsk_queue_empty(&tp->accept_queue))
1861                         break;
1862                 err = -EINVAL;
1863                 if (sk->sk_state != TCP_LISTEN)
1864                         break;
1865                 err = sock_intr_errno(timeo);
1866                 if (signal_pending(current))
1867                         break;
1868                 err = -EAGAIN;
1869                 if (!timeo)
1870                         break;
1871         }
1872         finish_wait(sk->sk_sleep, &wait);
1873         return err;
1874 }
1875
1876 /*
1877  *      This will accept the next outstanding connection.
1878  */
1879
1880 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1881 {
1882         struct tcp_sock *tp = tcp_sk(sk);
1883         struct sock *newsk;
1884         int error;
1885
1886         lock_sock(sk);
1887
1888         /* We need to make sure that this socket is listening,
1889          * and that it has something pending.
1890          */
1891         error = -EINVAL;
1892         if (sk->sk_state != TCP_LISTEN)
1893                 goto out_err;
1894
1895         /* Find already established connection */
1896         if (reqsk_queue_empty(&tp->accept_queue)) {
1897                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1898
1899                 /* If this is a non-blocking socket, don't sleep. */
1900                 error = -EAGAIN;
1901                 if (!timeo)
1902                         goto out_err;
1903
1904                 error = wait_for_connect(sk, timeo);
1905                 if (error)
1906                         goto out_err;
1907         }
1908
1909         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1910         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1911 out:
1912         release_sock(sk);
1913         return newsk;
1914 out_err:
1915         newsk = NULL;
1916         *err = error;
1917         goto out;
1918 }
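
/*
 * An illustrative user-space view of the non-blocking path above: with
 * O_NONBLOCK set, sock_rcvtimeo() returns 0, wait_for_connect() is
 * skipped, and accept() fails with EAGAIN while the queue is empty
 * (lfd is a placeholder for the listening socket):
 *
 *	int cfd = accept(lfd, NULL, NULL);
 *	if (cfd < 0 && errno == EAGAIN)
 *		poll for readability on lfd and retry
 */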
1919
1920 /*
1921  *      Socket option code for TCP.
1922  */
1923 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1924                    int optlen)
1925 {
1926         struct tcp_sock *tp = tcp_sk(sk);
1927         int val;
1928         int err = 0;
1929
1930         if (level != SOL_TCP)
1931                 return tp->af_specific->setsockopt(sk, level, optname,
1932                                                    optval, optlen);
1933
1934         /* This is a string value; all the others are ints. */
1935         if (optname == TCP_CONGESTION) {
1936                 char name[TCP_CA_NAME_MAX];
1937
1938                 if (optlen < 1)
1939                         return -EINVAL;
1940
1941                 val = strncpy_from_user(name, optval,
1942                                         min(TCP_CA_NAME_MAX-1, optlen));
1943                 if (val < 0)
1944                         return -EFAULT;
1945                 name[val] = 0;
1946
1947                 lock_sock(sk);
1948                 err = tcp_set_congestion_control(tp, name);
1949                 release_sock(sk);
1950                 return err;
1951         }
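
        /* An illustrative user-space use of the string option handled
         * above ("reno" is registered below in tcp_init(); other names
         * must match a loaded congestion control module):
         *
         *	setsockopt(fd, SOL_TCP, TCP_CONGESTION, "reno", strlen("reno"));
         */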
1952
1953         if (optlen < sizeof(int))
1954                 return -EINVAL;
1955
1956         if (get_user(val, (int __user *)optval))
1957                 return -EFAULT;
1958
1959         lock_sock(sk);
1960
1961         switch (optname) {
1962         case TCP_MAXSEG:
1963                 /* Values greater than the interface MTU won't take effect.
1964                  * However, at the point when this call is made we typically
1965                  * don't yet know which interface is going to be used. */
1966                 if (val < 8 || val > MAX_TCP_WINDOW) {
1967                         err = -EINVAL;
1968                         break;
1969                 }
1970                 tp->rx_opt.user_mss = val;
1971                 break;
1972
1973         case TCP_NODELAY:
1974                 if (val) {
1975                         /* TCP_NODELAY is weaker than TCP_CORK, so
1976                          * this option on a corked socket is remembered, but
1977                          * it is not activated until the cork is cleared.
1978                          *
1979                          * However, when TCP_NODELAY is set we make
1980                          * an explicit push, which overrides even TCP_CORK
1981                          * for currently queued segments.
1982                          */
1983                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1984                         tcp_push_pending_frames(sk, tp);
1985                 } else {
1986                         tp->nonagle &= ~TCP_NAGLE_OFF;
1987                 }
1988                 break;
1989
1990         case TCP_CORK:
1991                 /* When set, this tells TCP to always queue non-full frames.
1992                  * Later the user clears this option and we transmit
1993                  * any pending partial frames in the queue.  This is
1994                  * meant to be used alongside sendfile() to get properly
1995                  * filled frames when the user (for example) must write
1996                  * out headers with a write() call first and then use
1997                  * sendfile to send out the data parts.
1998                  *
1999                  * TCP_CORK can be set together with TCP_NODELAY and it is
2000                  * stronger than TCP_NODELAY.
2001                  */
2002                 if (val) {
2003                         tp->nonagle |= TCP_NAGLE_CORK;
2004                 } else {
2005                         tp->nonagle &= ~TCP_NAGLE_CORK;
2006                         if (tp->nonagle&TCP_NAGLE_OFF)
2007                                 tp->nonagle |= TCP_NAGLE_PUSH;
2008                         tcp_push_pending_frames(sk, tp);
2009                 }
2010                 break;
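
                /* An illustrative user-space sketch of the pattern the
                 * TCP_CORK comment above describes (error handling
                 * omitted; fd is a connected TCP socket, file_fd an
                 * open file, hdr/hdr_len the prepared headers):
                 *
                 *	int on = 1, off = 0;
                 *	setsockopt(fd, SOL_TCP, TCP_CORK, &on, sizeof(on));
                 *	write(fd, hdr, hdr_len);
                 *	sendfile(fd, file_fd, NULL, file_len);
                 *	setsockopt(fd, SOL_TCP, TCP_CORK, &off, sizeof(off));
                 */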
2011
2012         case TCP_KEEPIDLE:
2013                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2014                         err = -EINVAL;
2015                 else {
2016                         tp->keepalive_time = val * HZ;
2017                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2018                             !((1 << sk->sk_state) &
2019                               (TCPF_CLOSE | TCPF_LISTEN))) {
2020                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2021                                 if (tp->keepalive_time > elapsed)
2022                                         elapsed = tp->keepalive_time - elapsed;
2023                                 else
2024                                         elapsed = 0;
2025                                 tcp_reset_keepalive_timer(sk, elapsed);
2026                         }
2027                 }
2028                 break;
2029         case TCP_KEEPINTVL:
2030                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2031                         err = -EINVAL;
2032                 else
2033                         tp->keepalive_intvl = val * HZ;
2034                 break;
2035         case TCP_KEEPCNT:
2036                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2037                         err = -EINVAL;
2038                 else
2039                         tp->keepalive_probes = val;
2040                 break;
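
                /* An illustrative user-space keepalive setup using the
                 * three options above; SO_KEEPALIVE must be enabled for
                 * any probes to be sent at all (values are examples):
                 *
                 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
                 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
                 *	setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
                 *	setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
                 *	setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
                 */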
2041         case TCP_SYNCNT:
2042                 if (val < 1 || val > MAX_TCP_SYNCNT)
2043                         err = -EINVAL;
2044                 else
2045                         tp->syn_retries = val;
2046                 break;
2047
2048         case TCP_LINGER2:
2049                 if (val < 0)
2050                         tp->linger2 = -1;
2051                 else if (val > sysctl_tcp_fin_timeout / HZ)
2052                         tp->linger2 = 0;
2053                 else
2054                         tp->linger2 = val * HZ;
2055                 break;
2056
2057         case TCP_DEFER_ACCEPT:
2058                 tp->defer_accept = 0;
2059                 if (val > 0) {
2060                         /* Translate value in seconds to number of
2061                          * retransmits */
2062                         while (tp->defer_accept < 32 &&
2063                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2064                                        tp->defer_accept))
2065                                 tp->defer_accept++;
2066                         tp->defer_accept++;
2067                 }
2068                 break;
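
                /* A worked example of the translation above, assuming the
                 * usual TCP_TIMEOUT_INIT of 3 * HZ: val = 10 satisfies
                 * 10 > 3 and 10 > 6 but not 10 > 12, so the loop stops at
                 * defer_accept == 2 and the final increment stores 3;
                 * getsockopt(TCP_DEFER_ACCEPT) later reports this back as
                 * 3 << (3 - 1) == 12 seconds.
                 */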
2069
2070         case TCP_WINDOW_CLAMP:
2071                 if (!val) {
2072                         if (sk->sk_state != TCP_CLOSE) {
2073                                 err = -EINVAL;
2074                                 break;
2075                         }
2076                         tp->window_clamp = 0;
2077                 } else
2078                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2079                                                 SOCK_MIN_RCVBUF / 2 : val;
2080                 break;
2081
2082         case TCP_QUICKACK:
2083                 if (!val) {
2084                         tp->ack.pingpong = 1;
2085                 } else {
2086                         tp->ack.pingpong = 0;
2087                         if ((1 << sk->sk_state) &
2088                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2089                             tcp_ack_scheduled(tp)) {
2090                                 tp->ack.pending |= TCP_ACK_PUSHED;
2091                                 cleanup_rbuf(sk, 1);
2092                                 if (!(val & 1))
2093                                         tp->ack.pingpong = 1;
2094                         }
2095                 }
2096                 break;
2097
2098         default:
2099                 err = -ENOPROTOOPT;
2100                 break;
2101         };
2102         release_sock(sk);
2103         return err;
2104 }
2105
2106 /* Return information about the state of the TCP endpoint in API format. */
2107 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2108 {
2109         struct tcp_sock *tp = tcp_sk(sk);
2110         u32 now = tcp_time_stamp;
2111
2112         memset(info, 0, sizeof(*info));
2113
2114         info->tcpi_state = sk->sk_state;
2115         info->tcpi_ca_state = tp->ca_state;
2116         info->tcpi_retransmits = tp->retransmits;
2117         info->tcpi_probes = tp->probes_out;
2118         info->tcpi_backoff = tp->backoff;
2119
2120         if (tp->rx_opt.tstamp_ok)
2121                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2122         if (tp->rx_opt.sack_ok)
2123                 info->tcpi_options |= TCPI_OPT_SACK;
2124         if (tp->rx_opt.wscale_ok) {
2125                 info->tcpi_options |= TCPI_OPT_WSCALE;
2126                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2127                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2128         } 
2129
2130         if (tp->ecn_flags&TCP_ECN_OK)
2131                 info->tcpi_options |= TCPI_OPT_ECN;
2132
2133         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2134         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2135         info->tcpi_snd_mss = tp->mss_cache;
2136         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2137
2138         info->tcpi_unacked = tp->packets_out;
2139         info->tcpi_sacked = tp->sacked_out;
2140         info->tcpi_lost = tp->lost_out;
2141         info->tcpi_retrans = tp->retrans_out;
2142         info->tcpi_fackets = tp->fackets_out;
2143
2144         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2145         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2146         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2147
2148         info->tcpi_pmtu = tp->pmtu_cookie;
2149         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2150         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2151         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2152         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2153         info->tcpi_snd_cwnd = tp->snd_cwnd;
2154         info->tcpi_advmss = tp->advmss;
2155         info->tcpi_reordering = tp->reordering;
2156
2157         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2158         info->tcpi_rcv_space = tp->rcvq_space.space;
2159
2160         info->tcpi_total_retrans = tp->total_retrans;
2161 }
2162
2163 EXPORT_SYMBOL_GPL(tcp_get_info);
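
/*
 * An illustrative user-space consumer of tcp_get_info(), reached through
 * the TCP_INFO getsockopt handled below (struct tcp_info is exported via
 * <linux/tcp.h>; fd is a placeholder for a TCP socket):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *	getsockopt(fd, SOL_TCP, TCP_INFO, &info, &len);
 *	printf("rtt %u us, cwnd %u\n", info.tcpi_rtt, info.tcpi_snd_cwnd);
 */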
2164
2165 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2166                    int __user *optlen)
2167 {
2168         struct tcp_sock *tp = tcp_sk(sk);
2169         int val, len;
2170
2171         if (level != SOL_TCP)
2172                 return tp->af_specific->getsockopt(sk, level, optname,
2173                                                    optval, optlen);
2174
2175         if (get_user(len, optlen))
2176                 return -EFAULT;
2177
2178         len = min_t(unsigned int, len, sizeof(int));
2179
2180         if (len < 0)
2181                 return -EINVAL;
2182
2183         switch (optname) {
2184         case TCP_MAXSEG:
2185                 val = tp->mss_cache;
2186                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2187                         val = tp->rx_opt.user_mss;
2188                 break;
2189         case TCP_NODELAY:
2190                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2191                 break;
2192         case TCP_CORK:
2193                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2194                 break;
2195         case TCP_KEEPIDLE:
2196                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2197                 break;
2198         case TCP_KEEPINTVL:
2199                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2200                 break;
2201         case TCP_KEEPCNT:
2202                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2203                 break;
2204         case TCP_SYNCNT:
2205                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2206                 break;
2207         case TCP_LINGER2:
2208                 val = tp->linger2;
2209                 if (val >= 0)
2210                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2211                 break;
2212         case TCP_DEFER_ACCEPT:
2213                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2214                                                (tp->defer_accept - 1));
2215                 break;
2216         case TCP_WINDOW_CLAMP:
2217                 val = tp->window_clamp;
2218                 break;
2219         case TCP_INFO: {
2220                 struct tcp_info info;
2221
2222                 if (get_user(len, optlen))
2223                         return -EFAULT;
2224
2225                 tcp_get_info(sk, &info);
2226
2227                 len = min_t(unsigned int, len, sizeof(info));
2228                 if (put_user(len, optlen))
2229                         return -EFAULT;
2230                 if (copy_to_user(optval, &info, len))
2231                         return -EFAULT;
2232                 return 0;
2233         }
2234         case TCP_QUICKACK:
2235                 val = !tp->ack.pingpong;
2236                 break;
2237
2238         case TCP_CONGESTION:
2239                 if (get_user(len, optlen))
2240                         return -EFAULT;
2241                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242                 if (put_user(len, optlen))
2243                         return -EFAULT;
2244                 if (copy_to_user(optval, tp->ca_ops->name, len))
2245                         return -EFAULT;
2246                 return 0;
2247         default:
2248                 return -ENOPROTOOPT;
2249         };
2250
2251         if (put_user(len, optlen))
2252                 return -EFAULT;
2253         if (copy_to_user(optval, &val, len))
2254                 return -EFAULT;
2255         return 0;
2256 }
2257
2258
2259 extern void __skb_cb_too_small_for_tcp(int, int);
2260 extern struct tcp_congestion_ops tcp_reno;
2261
2262 static __initdata unsigned long thash_entries;
2263 static int __init set_thash_entries(char *str)
2264 {
2265         if (!str)
2266                 return 0;
2267         thash_entries = simple_strtoul(str, &str, 0);
2268         return 1;
2269 }
2270 __setup("thash_entries=", set_thash_entries);
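
/*
 * The established-hash size can be forced at boot, e.g. by passing
 * "thash_entries=131072" on the kernel command line; left at zero, the
 * alloc_large_system_hash() call below sizes the table from the amount
 * of physical memory instead.
 */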
2271
2272 void __init tcp_init(void)
2273 {
2274         struct sk_buff *skb = NULL;
2275         int order, i;
2276
2277         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2278                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2279                                            sizeof(skb->cb));
2280
2281         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2282                                               sizeof(struct tcp_bind_bucket),
2283                                               0, SLAB_HWCACHE_ALIGN,
2284                                               NULL, NULL);
2285         if (!tcp_bucket_cachep)
2286                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2287
2288         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2289                                                 sizeof(struct tcp_tw_bucket),
2290                                                 0, SLAB_HWCACHE_ALIGN,
2291                                                 NULL, NULL);
2292         if (!tcp_timewait_cachep)
2293                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2294
2295         /* Size and allocate the main established and bind bucket
2296          * hash tables.
2297          *
2298          * The methodology is similar to that of the buffer cache.
2299          */
2300         tcp_ehash = (struct tcp_ehash_bucket *)
2301                 alloc_large_system_hash("TCP established",
2302                                         sizeof(struct tcp_ehash_bucket),
2303                                         thash_entries,
2304                                         (num_physpages >= 128 * 1024) ?
2305                                                 (25 - PAGE_SHIFT) :
2306                                                 (27 - PAGE_SHIFT),
2307                                         HASH_HIGHMEM,
2308                                         &tcp_ehash_size,
2309                                         NULL,
2310                                         0);
2311         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2312         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2313                 rwlock_init(&tcp_ehash[i].lock);
2314                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2315         }
2316
2317         tcp_bhash = (struct tcp_bind_hashbucket *)
2318                 alloc_large_system_hash("TCP bind",
2319                                         sizeof(struct tcp_bind_hashbucket),
2320                                         tcp_ehash_size,
2321                                         (num_physpages >= 128 * 1024) ?
2322                                                 (25 - PAGE_SHIFT) :
2323                                                 (27 - PAGE_SHIFT),
2324                                         HASH_HIGHMEM,
2325                                         &tcp_bhash_size,
2326                                         NULL,
2327                                         64 * 1024);
2328         tcp_bhash_size = 1 << tcp_bhash_size;
2329         for (i = 0; i < tcp_bhash_size; i++) {
2330                 spin_lock_init(&tcp_bhash[i].lock);
2331                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2332         }
2333
2334         /* Try to be a bit smarter and adjust defaults depending
2335          * on available memory.
2336          */
2337         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2338                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2339                         order++)
2340                 ;
2341         if (order >= 4) {
2342                 sysctl_local_port_range[0] = 32768;
2343                 sysctl_local_port_range[1] = 61000;
2344                 sysctl_tcp_max_tw_buckets = 180000;
2345                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2346                 sysctl_max_syn_backlog = 1024;
2347         } else if (order < 3) {
2348                 sysctl_local_port_range[0] = 1024 * (3 - order);
2349                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2350                 sysctl_tcp_max_orphans >>= (3 - order);
2351                 sysctl_max_syn_backlog = 128;
2352         }
2353         tcp_port_rover = sysctl_local_port_range[0] - 1;
2354
2355         sysctl_tcp_mem[0] =  768 << order;
2356         sysctl_tcp_mem[1] = 1024 << order;
2357         sysctl_tcp_mem[2] = 1536 << order;
2358
2359         if (order < 3) {
2360                 sysctl_tcp_wmem[2] = 64 * 1024;
2361                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2362                 sysctl_tcp_rmem[1] = 43689;
2363                 sysctl_tcp_rmem[2] = 2 * 43689;
2364         }
2365
2366         printk(KERN_INFO "TCP: Hash tables configured "
2367                "(established %d bind %d)\n",
2368                tcp_ehash_size << 1, tcp_bhash_size);
2369
2370         tcp_register_congestion_control(&tcp_reno);
2371 }
2372
2373 EXPORT_SYMBOL(tcp_accept);
2374 EXPORT_SYMBOL(tcp_close);
2375 EXPORT_SYMBOL(tcp_destroy_sock);
2376 EXPORT_SYMBOL(tcp_disconnect);
2377 EXPORT_SYMBOL(tcp_getsockopt);
2378 EXPORT_SYMBOL(tcp_ioctl);
2379 EXPORT_SYMBOL(tcp_poll);
2380 EXPORT_SYMBOL(tcp_read_sock);
2381 EXPORT_SYMBOL(tcp_recvmsg);
2382 EXPORT_SYMBOL(tcp_sendmsg);
2383 EXPORT_SYMBOL(tcp_sendpage);
2384 EXPORT_SYMBOL(tcp_setsockopt);
2385 EXPORT_SYMBOL(tcp_shutdown);
2386 EXPORT_SYMBOL(tcp_statistics);
2387 EXPORT_SYMBOL(tcp_timewait_cachep);