Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/dtor/input.git manually
[linux-2.6] / net / ipv4 / tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed were wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties remove from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275 kmem_cache_t *tcp_timewait_cachep;
276
277 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278
279 int sysctl_tcp_mem[3];
280 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
281 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282
283 EXPORT_SYMBOL(sysctl_tcp_mem);
284 EXPORT_SYMBOL(sysctl_tcp_rmem);
285 EXPORT_SYMBOL(sysctl_tcp_wmem);
286
287 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
288 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
289
290 EXPORT_SYMBOL(tcp_memory_allocated);
291 EXPORT_SYMBOL(tcp_sockets_allocated);
292
293 /*
294  * Pressure flag: try to collapse.
295  * Technical note: it is used by multiple contexts non atomically.
296  * All the sk_stream_mem_schedule() is of this nature: accounting
297  * is strict, actions are advisory and have some latency.
298  */
299 int tcp_memory_pressure;
300
301 EXPORT_SYMBOL(tcp_memory_pressure);
302
303 void tcp_enter_memory_pressure(void)
304 {
305         if (!tcp_memory_pressure) {
306                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
307                 tcp_memory_pressure = 1;
308         }
309 }
310
311 EXPORT_SYMBOL(tcp_enter_memory_pressure);
312
313 /*
314  * LISTEN is a special case for poll..
315  */
316 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
317                                                poll_table *wait)
318 {
319         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
320 }
321
322 /*
323  *      Wait for a TCP event.
324  *
325  *      Note that we don't need to lock the socket, as the upper poll layers
326  *      take care of normal races (between the test and the event) and we don't
327  *      go look at any of the socket buffers directly.
328  */
329 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
330 {
331         unsigned int mask;
332         struct sock *sk = sock->sk;
333         struct tcp_sock *tp = tcp_sk(sk);
334
335         poll_wait(file, sk->sk_sleep, wait);
336         if (sk->sk_state == TCP_LISTEN)
337                 return tcp_listen_poll(sk, wait);
338
339         /* Socket is not locked. We are protected from async events
340            by poll logic and correct handling of state changes
341            made by another threads is impossible in any case.
342          */
343
344         mask = 0;
345         if (sk->sk_err)
346                 mask = POLLERR;
347
348         /*
349          * POLLHUP is certainly not done right. But poll() doesn't
350          * have a notion of HUP in just one direction, and for a
351          * socket the read side is more interesting.
352          *
353          * Some poll() documentation says that POLLHUP is incompatible
354          * with the POLLOUT/POLLWR flags, so somebody should check this
355          * all. But careful, it tends to be safer to return too many
356          * bits than too few, and you can easily break real applications
357          * if you don't tell them that something has hung up!
358          *
359          * Check-me.
360          *
361          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
362          * our fs/select.c). It means that after we received EOF,
363          * poll always returns immediately, making impossible poll() on write()
364          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
365          * if and only if shutdown has been made in both directions.
366          * Actually, it is interesting to look how Solaris and DUX
367          * solve this dilemma. I would prefer, if PULLHUP were maskable,
368          * then we could set it on SND_SHUTDOWN. BTW examples given
369          * in Stevens' books assume exactly this behaviour, it explains
370          * why PULLHUP is incompatible with POLLOUT.    --ANK
371          *
372          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
373          * blocking on fresh not-connected or disconnected socket. --ANK
374          */
375         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
376                 mask |= POLLHUP;
377         if (sk->sk_shutdown & RCV_SHUTDOWN)
378                 mask |= POLLIN | POLLRDNORM;
379
380         /* Connected? */
381         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
382                 /* Potential race condition. If read of tp below will
383                  * escape above sk->sk_state, we can be illegally awaken
384                  * in SYN_* states. */
385                 if ((tp->rcv_nxt != tp->copied_seq) &&
386                     (tp->urg_seq != tp->copied_seq ||
387                      tp->rcv_nxt != tp->copied_seq + 1 ||
388                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
389                         mask |= POLLIN | POLLRDNORM;
390
391                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
392                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
393                                 mask |= POLLOUT | POLLWRNORM;
394                         } else {  /* send SIGIO later */
395                                 set_bit(SOCK_ASYNC_NOSPACE,
396                                         &sk->sk_socket->flags);
397                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
398
399                                 /* Race breaker. If space is freed after
400                                  * wspace test but before the flags are set,
401                                  * IO signal will be lost.
402                                  */
403                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
404                                         mask |= POLLOUT | POLLWRNORM;
405                         }
406                 }
407
408                 if (tp->urg_data & TCP_URG_VALID)
409                         mask |= POLLPRI;
410         }
411         return mask;
412 }
413
414 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
415 {
416         struct tcp_sock *tp = tcp_sk(sk);
417         int answ;
418
419         switch (cmd) {
420         case SIOCINQ:
421                 if (sk->sk_state == TCP_LISTEN)
422                         return -EINVAL;
423
424                 lock_sock(sk);
425                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
426                         answ = 0;
427                 else if (sock_flag(sk, SOCK_URGINLINE) ||
428                          !tp->urg_data ||
429                          before(tp->urg_seq, tp->copied_seq) ||
430                          !before(tp->urg_seq, tp->rcv_nxt)) {
431                         answ = tp->rcv_nxt - tp->copied_seq;
432
433                         /* Subtract 1, if FIN is in queue. */
434                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
435                                 answ -=
436                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
437                 } else
438                         answ = tp->urg_seq - tp->copied_seq;
439                 release_sock(sk);
440                 break;
441         case SIOCATMARK:
442                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
443                 break;
444         case SIOCOUTQ:
445                 if (sk->sk_state == TCP_LISTEN)
446                         return -EINVAL;
447
448                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
449                         answ = 0;
450                 else
451                         answ = tp->write_seq - tp->snd_una;
452                 break;
453         default:
454                 return -ENOIOCTLCMD;
455         };
456
457         return put_user(answ, (int __user *)arg);
458 }
459
460
461 int tcp_listen_start(struct sock *sk)
462 {
463         struct inet_sock *inet = inet_sk(sk);
464         struct tcp_sock *tp = tcp_sk(sk);
465         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
466
467         if (rc != 0)
468                 return rc;
469
470         sk->sk_max_ack_backlog = 0;
471         sk->sk_ack_backlog = 0;
472         tcp_delack_init(tp);
473
474         /* There is race window here: we announce ourselves listening,
475          * but this transition is still not validated by get_port().
476          * It is OK, because this socket enters to hash table only
477          * after validation is complete.
478          */
479         sk->sk_state = TCP_LISTEN;
480         if (!sk->sk_prot->get_port(sk, inet->num)) {
481                 inet->sport = htons(inet->num);
482
483                 sk_dst_reset(sk);
484                 sk->sk_prot->hash(sk);
485
486                 return 0;
487         }
488
489         sk->sk_state = TCP_CLOSE;
490         reqsk_queue_destroy(&tp->accept_queue);
491         return -EADDRINUSE;
492 }
493
494 /*
495  *      This routine closes sockets which have been at least partially
496  *      opened, but not yet accepted.
497  */
498
499 static void tcp_listen_stop (struct sock *sk)
500 {
501         struct tcp_sock *tp = tcp_sk(sk);
502         struct listen_sock *lopt;
503         struct request_sock *acc_req;
504         struct request_sock *req;
505         int i;
506
507         tcp_delete_keepalive_timer(sk);
508
509         /* make all the listen_opt local to us */
510         lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
511         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513         if (lopt->qlen) {
514                 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
515                         while ((req = lopt->syn_table[i]) != NULL) {
516                                 lopt->syn_table[i] = req->dl_next;
517                                 lopt->qlen--;
518                                 reqsk_free(req);
519
520                 /* Following specs, it would be better either to send FIN
521                  * (and enter FIN-WAIT-1, it is normal close)
522                  * or to send active reset (abort).
523                  * Certainly, it is pretty dangerous while synflood, but it is
524                  * bad justification for our negligence 8)
525                  * To be honest, we are not able to make either
526                  * of the variants now.                 --ANK
527                  */
528                         }
529                 }
530         }
531         BUG_TRAP(!lopt->qlen);
532
533         kfree(lopt);
534
535         while ((req = acc_req) != NULL) {
536                 struct sock *child = req->sk;
537
538                 acc_req = req->dl_next;
539
540                 local_bh_disable();
541                 bh_lock_sock(child);
542                 BUG_TRAP(!sock_owned_by_user(child));
543                 sock_hold(child);
544
545                 tcp_disconnect(child, O_NONBLOCK);
546
547                 sock_orphan(child);
548
549                 atomic_inc(&tcp_orphan_count);
550
551                 tcp_destroy_sock(child);
552
553                 bh_unlock_sock(child);
554                 local_bh_enable();
555                 sock_put(child);
556
557                 sk_acceptq_removed(sk);
558                 __reqsk_free(req);
559         }
560         BUG_TRAP(!sk->sk_ack_backlog);
561 }
562
563 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
564 {
565         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
566         tp->pushed_seq = tp->write_seq;
567 }
568
569 static inline int forced_push(struct tcp_sock *tp)
570 {
571         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
572 }
573
574 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
575                               struct sk_buff *skb)
576 {
577         skb->csum = 0;
578         TCP_SKB_CB(skb)->seq = tp->write_seq;
579         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
580         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
581         TCP_SKB_CB(skb)->sacked = 0;
582         skb_header_release(skb);
583         __skb_queue_tail(&sk->sk_write_queue, skb);
584         sk_charge_skb(sk, skb);
585         if (!sk->sk_send_head)
586                 sk->sk_send_head = skb;
587         else if (tp->nonagle&TCP_NAGLE_PUSH)
588                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
589 }
590
591 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
592                                 struct sk_buff *skb)
593 {
594         if (flags & MSG_OOB) {
595                 tp->urg_mode = 1;
596                 tp->snd_up = tp->write_seq;
597                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
598         }
599 }
600
601 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
602                             int mss_now, int nonagle)
603 {
604         if (sk->sk_send_head) {
605                 struct sk_buff *skb = sk->sk_write_queue.prev;
606                 if (!(flags & MSG_MORE) || forced_push(tp))
607                         tcp_mark_push(tp, skb);
608                 tcp_mark_urg(tp, flags, skb);
609                 __tcp_push_pending_frames(sk, tp, mss_now,
610                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
611         }
612 }
613
/*
 * Queue page data on the socket's write queue by attaching the caller's
 * pages to skbs as page fragments (get_page() + skb_fill_page_desc()).
 *
 *   sk      - socket to send on; this function does not take the socket
 *             lock itself (tcp_sendpage() calls it between lock_sock() and
 *             release_sock()).
 *   pages   - array of pages holding the data.
 *   poffset - byte offset into that array at which the data starts.
 *   psize   - number of bytes to queue.
 *   flags   - MSG_* flags; MSG_DONTWAIT, MSG_OOB and MSG_MORE are looked at
 *             here or in tcp_push().
 *
 * Returns the number of bytes queued when at least one byte was queued,
 * otherwise the value produced by sk_stream_error() for the failure.
 */
614 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
615                          size_t psize, int flags)
616 {
617         struct tcp_sock *tp = tcp_sk(sk);
618         int mss_now;
619         int err;
620         ssize_t copied;
621         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
622
623         /* Wait for a connection to finish. */
624         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
625                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
626                         goto out_err;
627
628         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629
630         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631         copied = 0;
632
            /* Refuse to queue anything on a socket that has a pending error
             * or whose send side has been shut down. */
633         err = -EPIPE;
634         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
635                 goto do_error;
636
637         while (psize > 0) {
                    /* Tail of the write queue, and the page / in-page offset
                     * for the current position.  size is capped so that one
                     * iteration never crosses a page boundary. */
638                 struct sk_buff *skb = sk->sk_write_queue.prev;
639                 struct page *page = pages[poffset / PAGE_SIZE];
640                 int copy, i, can_coalesce;
641                 int offset = poffset % PAGE_SIZE;
642                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643
                    /* Start a new skb when nothing is waiting to be sent or
                     * the tail skb already holds mss_now bytes; otherwise
                     * copy is the room left in the tail skb. */
644                 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
645 new_segment:
646                         if (!sk_stream_memory_free(sk))
647                                 goto wait_for_sndbuf;
648
649                         skb = sk_stream_alloc_pskb(sk, 0, 0,
650                                                    sk->sk_allocation);
651                         if (!skb)
652                                 goto wait_for_memory;
653
654                         skb_entail(sk, tp, skb);
655                         copy = mss_now;
656                 }
657
658                 if (copy > size)
659                         copy = size;
660
                    /* The page cannot be merged into the last fragment and
                     * every fragment slot is in use: mark this skb for push
                     * and continue in a fresh one. */
661                 i = skb_shinfo(skb)->nr_frags;
662                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
663                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
664                         tcp_mark_push(tp, skb);
665                         goto new_segment;
666                 }
                    /* sk_forward_alloc does not cover copy bytes: ask for
                     * more, and wait for memory if that is refused. */
667                 if (sk->sk_forward_alloc < copy &&
668                     !sk_stream_mem_schedule(sk, copy, 0))
669                         goto wait_for_memory;
670                 
                    /* Either grow the last fragment, or take a reference on
                     * the page and add it as fragment i. */
671                 if (can_coalesce) {
672                         skb_shinfo(skb)->frags[i - 1].size += copy;
673                 } else {
674                         get_page(page);
675                         skb_fill_page_desc(skb, i, page, offset, copy);
676                 }
677
                    /* Account the new bytes in the skb, the socket memory
                     * counters and the TCP sequence space. */
678                 skb->len += copy;
679                 skb->data_len += copy;
680                 skb->truesize += copy;
681                 sk->sk_wmem_queued += copy;
682                 sk->sk_forward_alloc -= copy;
683                 skb->ip_summed = CHECKSUM_HW;
684                 tp->write_seq += copy;
685                 TCP_SKB_CB(skb)->end_seq += copy;
686                 skb_shinfo(skb)->tso_segs = 0;
687
                    /* First bytes queued by this call: clear PSH on the skb
                     * they landed in. */
688                 if (!copied)
689                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
690
691                 copied += copy;
692                 poffset += copy;
693                 if (!(psize -= copy))
694                         goto out;
695
                    /* Keep filling unless the skb holds exactly mss_now
                     * bytes; with MSG_OOB nothing is pushed from inside the
                     * loop. */
696                 if (skb->len != mss_now || (flags & MSG_OOB))
697                         continue;
698
699                 if (forced_push(tp)) {
700                         tcp_mark_push(tp, skb);
701                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
702                 } else if (skb == sk->sk_send_head)
703                         tcp_push_one(sk, mss_now);
704                 continue;
705
706 wait_for_sndbuf:
707                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
708 wait_for_memory:
                    /* Push what has been queued so far (with MSG_MORE
                     * masked off) before waiting for memory. */
709                 if (copied)
710                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
711
712                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
713                         goto do_error;
714
                    /* Recompute the MSS after the wait. */
715                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
716         }
717
718 out:
719         if (copied)
720                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
721         return copied;
722
723 do_error:
            /* Partial success wins: report the bytes already queued rather
             * than the error. */
724         if (copied)
725                 goto out;
726 out_err:
727         return sk_stream_error(sk, flags, err);
728 }
729
730 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
731                      size_t size, int flags)
732 {
733         ssize_t res;
734         struct sock *sk = sock->sk;
735
736 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
737
738         if (!(sk->sk_route_caps & NETIF_F_SG) ||
739             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
740                 return sock_no_sendpage(sock, page, offset, size, flags);
741
742 #undef TCP_ZC_CSUM_FLAGS
743
744         lock_sock(sk);
745         TCP_CHECK_TIMER(sk);
746         res = do_tcp_sendpages(sk, &page, offset, size, flags);
747         TCP_CHECK_TIMER(sk);
748         release_sock(sk);
749         return res;
750 }
751
/*
 * Accessors for the socket's sk_sndmsg_page / sk_sndmsg_off fields.
 * Both expand to lvalues, so they can be assigned to as well as read.
 * The argument is parenthesized so that any pointer expression
 * (e.g. "base + 1") can be passed safely.
 */
#define TCP_PAGE(sk)    ((sk)->sk_sndmsg_page)
#define TCP_OFF(sk)     ((sk)->sk_sndmsg_off)
754
755 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756 {
757         int tmp = tp->mss_cache_std;
758
759         if (sk->sk_route_caps & NETIF_F_SG) {
760                 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761
762                 if (tmp >= pgbreak &&
763                     tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764                         tmp = pgbreak;
765         }
766         return tmp;
767 }
768
/*
 * Copy the user data described by msg->msg_iov into skbs on the socket's
 * write queue and push them out.
 *
 *   iocb - not referenced in this function body.
 *   sk   - socket to send on; locked here for the whole call.
 *   msg  - supplies the iovec array (msg_iov / msg_iovlen) and the MSG_*
 *          flags (msg_flags).
 *   size - not referenced in this function body; the amount sent is taken
 *          from the iovec lengths.
 *
 * Returns the number of bytes copied when at least one byte was copied,
 * otherwise the value produced by sk_stream_error() for the failure.
 */
769 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
770                 size_t size)
771 {
772         struct iovec *iov;
773         struct tcp_sock *tp = tcp_sk(sk);
774         struct sk_buff *skb;
775         int iovlen, flags;
776         int mss_now;
777         int err, copied;
778         long timeo;
779
780         lock_sock(sk);
781         TCP_CHECK_TIMER(sk);
782
783         flags = msg->msg_flags;
784         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
785
786         /* Wait for a connection to finish. */
787         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
788                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
789                         goto out_err;
790
791         /* This should be in poll */
792         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793
794         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
795
796         /* Ok commence sending. */
797         iovlen = msg->msg_iovlen;
798         iov = msg->msg_iov;
799         copied = 0;
800
            /* Refuse to queue anything on a socket that has a pending error
             * or whose send side has been shut down. */
801         err = -EPIPE;
802         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
803                 goto do_error;
804
            /* Outer loop: one iteration per iovec entry.  After the
             * decrement, iovlen is the number of entries still to come. */
805         while (--iovlen >= 0) {
806                 int seglen = iov->iov_len;
807                 unsigned char __user *from = iov->iov_base;
808
809                 iov++;
810
                    /* Inner loop: consume this iovec entry, at most one skb
                     * head or one page fragment per iteration. */
811                 while (seglen > 0) {
812                         int copy;
813
814                         skb = sk->sk_write_queue.prev;
815
                            /* Start a new skb when nothing is waiting to be
                             * sent or the tail skb already holds mss_now
                             * bytes; otherwise copy is the room left. */
816                         if (!sk->sk_send_head ||
817                             (copy = mss_now - skb->len) <= 0) {
818
819 new_segment:
820                                 /* Allocate new segment. If the interface is SG,
821                                  * allocate skb fitting to single page.
822                                  */
823                                 if (!sk_stream_memory_free(sk))
824                                         goto wait_for_sndbuf;
825
826                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
827                                                            0, sk->sk_allocation);
828                                 if (!skb)
829                                         goto wait_for_memory;
830
831                                 /*
832                                  * Check whether we can use HW checksum.
833                                  */
834                                 if (sk->sk_route_caps &
835                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
836                                      NETIF_F_HW_CSUM))
837                                         skb->ip_summed = CHECKSUM_HW;
838
839                                 skb_entail(sk, tp, skb);
840                                 copy = mss_now;
841                         }
842
843                         /* Try to append data to the end of skb. */
844                         if (copy > seglen)
845                                 copy = seglen;
846
847                         /* Where to copy to? */
848                         if (skb_tailroom(skb) > 0) {
849                                 /* We have some space in skb head. Superb! */
850                                 if (copy > skb_tailroom(skb))
851                                         copy = skb_tailroom(skb);
852                                 if ((err = skb_add_data(skb, from, copy)) != 0)
853                                         goto do_fault;
854                         } else {
                                    /* No tailroom: copy into a page fragment,
                                     * using the page and offset remembered on
                                     * the socket (TCP_PAGE / TCP_OFF). */
855                                 int merge = 0;
856                                 int i = skb_shinfo(skb)->nr_frags;
857                                 struct page *page = TCP_PAGE(sk);
858                                 int off = TCP_OFF(sk);
859
860                                 if (skb_can_coalesce(skb, i, page, off) &&
861                                     off != PAGE_SIZE) {
862                                         /* We can extend the last page
863                                          * fragment. */
864                                         merge = 1;
865                                 } else if (i == MAX_SKB_FRAGS ||
866                                            (!i &&
867                                            !(sk->sk_route_caps & NETIF_F_SG))) {
868                                         /* Need to add new fragment and cannot
869                                          * do this because interface is non-SG,
870                                          * or because all the page slots are
871                                          * busy. */
872                                         tcp_mark_push(tp, skb);
873                                         goto new_segment;
874                                 } else if (page) {
875                                         /* If page is cached, align
876                                          * offset to L1 cache boundary
877                                          */
878                                         off = (off + L1_CACHE_BYTES - 1) &
879                                               ~(L1_CACHE_BYTES - 1);
                                            /* Rounding reached the end of the
                                             * page: drop the reference and
                                             * forget the cached page. */
880                                         if (off == PAGE_SIZE) {
881                                                 put_page(page);
882                                                 TCP_PAGE(sk) = page = NULL;
883                                         }
884                                 }
885
886                                 if (!page) {
887                                         /* Allocate new cache page. */
888                                         if (!(page = sk_stream_alloc_page(sk)))
889                                                 goto wait_for_memory;
890                                         off = 0;
891                                 }
892
893                                 if (copy > PAGE_SIZE - off)
894                                         copy = PAGE_SIZE - off;
895
896                                 /* Time to copy data. We are close to
897                                  * the end! */
898                                 err = skb_copy_to_page(sk, from, skb, page,
899                                                        off, copy);
900                                 if (err) {
901                                         /* If this page was new, give it to the
902                                          * socket so it does not get leaked.
903                                          */
904                                         if (!TCP_PAGE(sk)) {
905                                                 TCP_PAGE(sk) = page;
906                                                 TCP_OFF(sk) = 0;
907                                         }
908                                         goto do_error;
909                                 }
910
911                                 /* Update the skb. */
912                                 if (merge) {
913                                         skb_shinfo(skb)->frags[i - 1].size +=
914                                                                         copy;
915                                 } else {
                                            /* New fragment.  If the page is
                                             * already cached on the socket,
                                             * take an extra reference.  If it
                                             * is fresh and still has room,
                                             * take a reference and cache it.
                                             * A fresh page that is now full
                                             * gets no extra reference here
                                             * (presumably the fragment keeps
                                             * the allocation's reference --
                                             * confirm against
                                             * sk_stream_alloc_page()). */
916                                         skb_fill_page_desc(skb, i, page, off, copy);
917                                         if (TCP_PAGE(sk)) {
918                                                 get_page(page);
919                                         } else if (off + copy < PAGE_SIZE) {
920                                                 get_page(page);
921                                                 TCP_PAGE(sk) = page;
922                                         }
923                                 }
924
925                                 TCP_OFF(sk) = off + copy;
926                         }
927
                            /* First bytes copied by this call: clear PSH on
                             * the skb they landed in. */
928                         if (!copied)
929                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
930
931                         tp->write_seq += copy;
932                         TCP_SKB_CB(skb)->end_seq += copy;
933                         skb_shinfo(skb)->tso_segs = 0;
934
935                         from += copy;
936                         copied += copy;
                            /* Done once the last iovec entry is consumed. */
937                         if ((seglen -= copy) == 0 && iovlen == 0)
938                                 goto out;
939
                            /* Keep filling unless the skb holds exactly
                             * mss_now bytes; with MSG_OOB nothing is pushed
                             * from inside the loop. */
940                         if (skb->len != mss_now || (flags & MSG_OOB))
941                                 continue;
942
943                         if (forced_push(tp)) {
944                                 tcp_mark_push(tp, skb);
945                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
946                         } else if (skb == sk->sk_send_head)
947                                 tcp_push_one(sk, mss_now);
948                         continue;
949
950 wait_for_sndbuf:
951                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
952 wait_for_memory:
                            /* Push what has been queued so far (with MSG_MORE
                             * masked off) before waiting for memory. */
953                         if (copied)
954                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
955
956                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
957                                 goto do_error;
958
                            /* Recompute the MSS after the wait. */
959                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
960                 }
961         }
962
963 out:
964         if (copied)
965                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
966         TCP_CHECK_TIMER(sk);
967         release_sock(sk);
968         return copied;
969
970 do_fault:
            /* skb_add_data() failed.  If the skb is still empty, take it off
             * the write queue (clearing sk_send_head if it pointed at it)
             * and free it. */
971         if (!skb->len) {
972                 if (sk->sk_send_head == skb)
973                         sk->sk_send_head = NULL;
974                 __skb_unlink(skb, skb->list);
975                 sk_stream_free_skb(sk, skb);
976         }
977
978 do_error:
            /* Partial success wins: report the bytes already copied rather
             * than the error. */
979         if (copied)
980                 goto out;
981 out_err:
982         err = sk_stream_error(sk, flags, err);
983         TCP_CHECK_TIMER(sk);
984         release_sock(sk);
985         return err;
986 }
987
988 /*
989  *      Handle reading urgent data. BSD has very simple semantics for
990  *      this, no blocking and very strange errors 8)
991  */
992
993 static int tcp_recv_urg(struct sock *sk, long timeo,
994                         struct msghdr *msg, int len, int flags,
995                         int *addr_len)
996 {
997         struct tcp_sock *tp = tcp_sk(sk);
998
999         /* No URG data to read. */
1000         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1001             tp->urg_data == TCP_URG_READ)
1002                 return -EINVAL; /* Yes this is right ! */
1003
1004         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1005                 return -ENOTCONN;
1006
1007         if (tp->urg_data & TCP_URG_VALID) {
1008                 int err = 0;
1009                 char c = tp->urg_data;
1010
1011                 if (!(flags & MSG_PEEK))
1012                         tp->urg_data = TCP_URG_READ;
1013
1014                 /* Read urgent data. */
1015                 msg->msg_flags |= MSG_OOB;
1016
1017                 if (len > 0) {
1018                         if (!(flags & MSG_TRUNC))
1019                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1020                         len = 1;
1021                 } else
1022                         msg->msg_flags |= MSG_TRUNC;
1023
1024                 return err ? -EFAULT : len;
1025         }
1026
1027         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1028                 return 0;
1029
1030         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1031          * the available implementations agree in this case:
1032          * this call should never block, independent of the
1033          * blocking state of the socket.
1034          * Mike <pall@rz.uni-karlsruhe.de>
1035          */
1036         return -EAGAIN;
1037 }
1038
1039 /* Clean up the receive buffer for full frames taken by the user,
1040  * then send an ACK if necessary.  COPIED is the number of bytes
1041  * tcp_recvmsg has given to the user so far, it speeds up the
1042  * calculation of whether or not we must ACK for the sake of
1043  * a window update.
1044  */
1045 static void cleanup_rbuf(struct sock *sk, int copied)
1046 {
1047         struct tcp_sock *tp = tcp_sk(sk);
1048         int time_to_ack = 0;
1049
1050 #if TCP_DEBUG
1051         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1052
1053         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1054 #endif
1055
1056         if (tcp_ack_scheduled(tp)) {
1057                    /* Delayed ACKs frequently hit locked sockets during bulk
1058                     * receive. */
1059                 if (tp->ack.blocked ||
1060                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1061                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1062                     /*
1063                      * If this read emptied read buffer, we send ACK, if
1064                      * connection is not bidirectional, user drained
1065                      * receive buffer and there was a small segment
1066                      * in queue.
1067                      */
1068                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1069                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1070                         time_to_ack = 1;
1071         }
1072
1073         /* We send an ACK if we can now advertise a non-zero window
1074          * which has been raised "significantly".
1075          *
1076          * Even if window raised up to infinity, do not send window open ACK
1077          * in states, where we will not receive more. It is useless.
1078          */
1079         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1080                 __u32 rcv_window_now = tcp_receive_window(tp);
1081
1082                 /* Optimize, __tcp_select_window() is not cheap. */
1083                 if (2*rcv_window_now <= tp->window_clamp) {
1084                         __u32 new_window = __tcp_select_window(sk);
1085
1086                         /* Send ACK now, if this read freed lots of space
1087                          * in our buffer. Certainly, new_window is new window.
1088                          * We can advertise it now, if it is not less than current one.
1089                          * "Lots" means "at least twice" here.
1090                          */
1091                         if (new_window && new_window >= 2 * rcv_window_now)
1092                                 time_to_ack = 1;
1093                 }
1094         }
1095         if (time_to_ack)
1096                 tcp_send_ack(sk);
1097 }
1098
1099 static void tcp_prequeue_process(struct sock *sk)
1100 {
1101         struct sk_buff *skb;
1102         struct tcp_sock *tp = tcp_sk(sk);
1103
1104         NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
1105
1106         /* RX process wants to run with disabled BHs, though it is not
1107          * necessary */
1108         local_bh_disable();
1109         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1110                 sk->sk_backlog_rcv(sk, skb);
1111         local_bh_enable();
1112
1113         /* Clear memory counter. */
1114         tp->ucopy.memory = 0;
1115 }
1116
1117 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1118 {
1119         struct sk_buff *skb;
1120         u32 offset;
1121
1122         skb_queue_walk(&sk->sk_receive_queue, skb) {
1123                 offset = seq - TCP_SKB_CB(skb)->seq;
1124                 if (skb->h.th->syn)
1125                         offset--;
1126                 if (offset < skb->len || skb->h.th->fin) {
1127                         *off = offset;
1128                         return skb;
1129                 }
1130         }
1131         return NULL;
1132 }
1133
1134 /*
1135  * This routine provides an alternative to tcp_recvmsg() for routines
1136  * that would like to handle copying from skbuffs directly in 'sendfile'
1137  * fashion.
1138  * Note:
1139  *      - It is assumed that the socket was locked by the caller.
1140  *      - The routine does not block.
1141  *      - At present, there is no support for reading OOB data
1142  *        or for 'peeking' the socket using this routine
1143  *        (although both would be easy to implement).
1144  */
1145 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1146                   sk_read_actor_t recv_actor)
1147 {
1148         struct sk_buff *skb;
1149         struct tcp_sock *tp = tcp_sk(sk);
1150         u32 seq = tp->copied_seq;
1151         u32 offset;
1152         int copied = 0;
1153
1154         if (sk->sk_state == TCP_LISTEN)
1155                 return -ENOTCONN;
1156         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1157                 if (offset < skb->len) {
1158                         size_t used, len;
1159
1160                         len = skb->len - offset;
1161                         /* Stop reading if we hit a patch of urgent data */
1162                         if (tp->urg_data) {
1163                                 u32 urg_offset = tp->urg_seq - seq;
1164                                 if (urg_offset < len)
1165                                         len = urg_offset;
1166                                 if (!len)
1167                                         break;
1168                         }
1169                         used = recv_actor(desc, skb, offset, len);
1170                         if (used <= len) {
1171                                 seq += used;
1172                                 copied += used;
1173                                 offset += used;
1174                         }
1175                         if (offset != skb->len)
1176                                 break;
1177                 }
1178                 if (skb->h.th->fin) {
1179                         sk_eat_skb(sk, skb);
1180                         ++seq;
1181                         break;
1182                 }
1183                 sk_eat_skb(sk, skb);
1184                 if (!desc->count)
1185                         break;
1186         }
1187         tp->copied_seq = seq;
1188
1189         tcp_rcv_space_adjust(sk);
1190
1191         /* Clean up data we have read: This will do ACK frames. */
1192         if (copied)
1193                 cleanup_rbuf(sk, copied);
1194         return copied;
1195 }
1196
1197 /*
1198  *      This routine copies from a sock struct into the user buffer.
1199  *
1200  *      Technical note: in 2.3 we work on _locked_ socket, so that
1201  *      tricks with *seq access order and skb->users are not required.
1202  *      Probably, code can be easily improved even more.
1203  */
1204
1205 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1206                 size_t len, int nonblock, int flags, int *addr_len)
1207 {
1208         struct tcp_sock *tp = tcp_sk(sk);
1209         int copied = 0;
1210         u32 peek_seq;
1211         u32 *seq;
1212         unsigned long used;
1213         int err;
1214         int target;             /* Read at least this many bytes */
1215         long timeo;
1216         struct task_struct *user_recv = NULL;
1217
1218         lock_sock(sk);
1219
1220         TCP_CHECK_TIMER(sk);
1221
1222         err = -ENOTCONN;
1223         if (sk->sk_state == TCP_LISTEN)
1224                 goto out;
1225
1226         timeo = sock_rcvtimeo(sk, nonblock);
1227
1228         /* Urgent data needs to be handled specially. */
1229         if (flags & MSG_OOB)
1230                 goto recv_urg;
1231
1232         seq = &tp->copied_seq;
1233         if (flags & MSG_PEEK) {
1234                 peek_seq = tp->copied_seq;
1235                 seq = &peek_seq;
1236         }
1237
1238         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1239
1240         do {
1241                 struct sk_buff *skb;
1242                 u32 offset;
1243
1244                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1245                 if (tp->urg_data && tp->urg_seq == *seq) {
1246                         if (copied)
1247                                 break;
1248                         if (signal_pending(current)) {
1249                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1250                                 break;
1251                         }
1252                 }
1253
1254                 /* Next get a buffer. */
1255
1256                 skb = skb_peek(&sk->sk_receive_queue);
1257                 do {
1258                         if (!skb)
1259                                 break;
1260
1261                         /* Now that we have two receive queues this
1262                          * shouldn't happen.
1263                          */
1264                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1265                                 printk(KERN_INFO "recvmsg bug: copied %X "
1266                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1267                                 break;
1268                         }
1269                         offset = *seq - TCP_SKB_CB(skb)->seq;
1270                         if (skb->h.th->syn)
1271                                 offset--;
1272                         if (offset < skb->len)
1273                                 goto found_ok_skb;
1274                         if (skb->h.th->fin)
1275                                 goto found_fin_ok;
1276                         BUG_TRAP(flags & MSG_PEEK);
1277                         skb = skb->next;
1278                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1279
1280                 /* Well, if we have backlog, try to process it now yet. */
1281
1282                 if (copied >= target && !sk->sk_backlog.tail)
1283                         break;
1284
1285                 if (copied) {
1286                         if (sk->sk_err ||
1287                             sk->sk_state == TCP_CLOSE ||
1288                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1289                             !timeo ||
1290                             signal_pending(current) ||
1291                             (flags & MSG_PEEK))
1292                                 break;
1293                 } else {
1294                         if (sock_flag(sk, SOCK_DONE))
1295                                 break;
1296
1297                         if (sk->sk_err) {
1298                                 copied = sock_error(sk);
1299                                 break;
1300                         }
1301
1302                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1303                                 break;
1304
1305                         if (sk->sk_state == TCP_CLOSE) {
1306                                 if (!sock_flag(sk, SOCK_DONE)) {
1307                                         /* This occurs when user tries to read
1308                                          * from never connected socket.
1309                                          */
1310                                         copied = -ENOTCONN;
1311                                         break;
1312                                 }
1313                                 break;
1314                         }
1315
1316                         if (!timeo) {
1317                                 copied = -EAGAIN;
1318                                 break;
1319                         }
1320
1321                         if (signal_pending(current)) {
1322                                 copied = sock_intr_errno(timeo);
1323                                 break;
1324                         }
1325                 }
1326
1327                 cleanup_rbuf(sk, copied);
1328
1329                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1330                         /* Install new reader */
1331                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1332                                 user_recv = current;
1333                                 tp->ucopy.task = user_recv;
1334                                 tp->ucopy.iov = msg->msg_iov;
1335                         }
1336
1337                         tp->ucopy.len = len;
1338
1339                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1340                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1341
1342                         /* Ugly... If prequeue is not empty, we have to
1343                          * process it before releasing socket, otherwise
1344                          * order will be broken at second iteration.
1345                          * More elegant solution is required!!!
1346                          *
1347                          * Look: we have the following (pseudo)queues:
1348                          *
1349                          * 1. packets in flight
1350                          * 2. backlog
1351                          * 3. prequeue
1352                          * 4. receive_queue
1353                          *
1354                          * Each queue can be processed only if the next ones
1355                          * are empty. At this point we have empty receive_queue.
1356                          * But prequeue _can_ be not empty after 2nd iteration,
1357                          * when we jumped to start of loop because backlog
1358                          * processing added something to receive_queue.
1359                          * We cannot release_sock(), because backlog contains
1360                          * packets arrived _after_ prequeued ones.
1361                          *
1362                          * Shortly, algorithm is clear --- to process all
1363                          * the queues in order. We could make it more directly,
1364                          * requeueing packets from backlog to prequeue, if
1365                          * is not empty. It is more elegant, but eats cycles,
1366                          * unfortunately.
1367                          */
1368                         if (skb_queue_len(&tp->ucopy.prequeue))
1369                                 goto do_prequeue;
1370
1371                         /* __ Set realtime policy in scheduler __ */
1372                 }
1373
1374                 if (copied >= target) {
1375                         /* Do not sleep, just process backlog. */
1376                         release_sock(sk);
1377                         lock_sock(sk);
1378                 } else
1379                         sk_wait_data(sk, &timeo);
1380
1381                 if (user_recv) {
1382                         int chunk;
1383
1384                         /* __ Restore normal policy in scheduler __ */
1385
1386                         if ((chunk = len - tp->ucopy.len) != 0) {
1387                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1388                                 len -= chunk;
1389                                 copied += chunk;
1390                         }
1391
1392                         if (tp->rcv_nxt == tp->copied_seq &&
1393                             skb_queue_len(&tp->ucopy.prequeue)) {
1394 do_prequeue:
1395                                 tcp_prequeue_process(sk);
1396
1397                                 if ((chunk = len - tp->ucopy.len) != 0) {
1398                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1399                                         len -= chunk;
1400                                         copied += chunk;
1401                                 }
1402                         }
1403                 }
1404                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1405                         if (net_ratelimit())
1406                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1407                                        current->comm, current->pid);
1408                         peek_seq = tp->copied_seq;
1409                 }
1410                 continue;
1411
1412         found_ok_skb:
1413                 /* Ok so how much can we use? */
1414                 used = skb->len - offset;
1415                 if (len < used)
1416                         used = len;
1417
1418                 /* Do we have urgent data here? */
1419                 if (tp->urg_data) {
1420                         u32 urg_offset = tp->urg_seq - *seq;
1421                         if (urg_offset < used) {
1422                                 if (!urg_offset) {
1423                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1424                                                 ++*seq;
1425                                                 offset++;
1426                                                 used--;
1427                                                 if (!used)
1428                                                         goto skip_copy;
1429                                         }
1430                                 } else
1431                                         used = urg_offset;
1432                         }
1433                 }
1434
1435                 if (!(flags & MSG_TRUNC)) {
1436                         err = skb_copy_datagram_iovec(skb, offset,
1437                                                       msg->msg_iov, used);
1438                         if (err) {
1439                                 /* Exception. Bailout! */
1440                                 if (!copied)
1441                                         copied = -EFAULT;
1442                                 break;
1443                         }
1444                 }
1445
1446                 *seq += used;
1447                 copied += used;
1448                 len -= used;
1449
1450                 tcp_rcv_space_adjust(sk);
1451
1452 skip_copy:
1453                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1454                         tp->urg_data = 0;
1455                         tcp_fast_path_check(sk, tp);
1456                 }
1457                 if (used + offset < skb->len)
1458                         continue;
1459
1460                 if (skb->h.th->fin)
1461                         goto found_fin_ok;
1462                 if (!(flags & MSG_PEEK))
1463                         sk_eat_skb(sk, skb);
1464                 continue;
1465
1466         found_fin_ok:
1467                 /* Process the FIN. */
1468                 ++*seq;
1469                 if (!(flags & MSG_PEEK))
1470                         sk_eat_skb(sk, skb);
1471                 break;
1472         } while (len > 0);
1473
1474         if (user_recv) {
1475                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1476                         int chunk;
1477
1478                         tp->ucopy.len = copied > 0 ? len : 0;
1479
1480                         tcp_prequeue_process(sk);
1481
1482                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1483                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1484                                 len -= chunk;
1485                                 copied += chunk;
1486                         }
1487                 }
1488
1489                 tp->ucopy.task = NULL;
1490                 tp->ucopy.len = 0;
1491         }
1492
1493         /* According to UNIX98, msg_name/msg_namelen are ignored
1494          * on connected socket. I was just happy when found this 8) --ANK
1495          */
1496
1497         /* Clean up data we have read: This will do ACK frames. */
1498         cleanup_rbuf(sk, copied);
1499
1500         TCP_CHECK_TIMER(sk);
1501         release_sock(sk);
1502         return copied;
1503
1504 out:
1505         TCP_CHECK_TIMER(sk);
1506         release_sock(sk);
1507         return err;
1508
1509 recv_urg:
1510         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1511         goto out;
1512 }
1513
1514 /*
1515  *      State processing on a close. This implements the state shift for
1516  *      sending our FIN frame. Note that we only send a FIN for some
1517  *      states. A shutdown() may have already sent the FIN, or we may be
1518  *      closed.
1519  */
1520
1521 static unsigned char new_state[16] = {
1522   /* current state:        new state:      action:      */
1523   /* (Invalid)          */ TCP_CLOSE,
1524   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1525   /* TCP_SYN_SENT       */ TCP_CLOSE,
1526   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1527   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1528   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1529   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1530   /* TCP_CLOSE          */ TCP_CLOSE,
1531   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1532   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1533   /* TCP_LISTEN         */ TCP_CLOSE,
1534   /* TCP_CLOSING        */ TCP_CLOSING,
1535 };
1536
1537 static int tcp_close_state(struct sock *sk)
1538 {
1539         int next = (int)new_state[sk->sk_state];
1540         int ns = next & TCP_STATE_MASK;
1541
1542         tcp_set_state(sk, ns);
1543
1544         return next & TCP_ACTION_FIN;
1545 }
1546
1547 /*
1548  *      Shutdown the sending side of a connection. Much like close except
1549  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1550  */
1551
1552 void tcp_shutdown(struct sock *sk, int how)
1553 {
1554         /*      We need to grab some memory, and put together a FIN,
1555          *      and then put it into the queue to be sent.
1556          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1557          */
1558         if (!(how & SEND_SHUTDOWN))
1559                 return;
1560
1561         /* If we've already sent a FIN, or it's a closed state, skip this. */
1562         if ((1 << sk->sk_state) &
1563             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1564              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1565                 /* Clear out any half completed packets.  FIN if needed. */
1566                 if (tcp_close_state(sk))
1567                         tcp_send_fin(sk);
1568         }
1569 }
1570
1571 /*
1572  * At this point, there should be no process reference to this
1573  * socket, and thus no user references at all.  Therefore we
1574  * can assume the socket waitqueue is inactive and nobody will
1575  * try to jump onto it.
1576  */
1577 void tcp_destroy_sock(struct sock *sk)
1578 {
1579         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1580         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1581
1582         /* It cannot be in hash table! */
1583         BUG_TRAP(sk_unhashed(sk));
1584
1585         /* If it has not 0 inet_sk(sk)->num, it must be bound */
1586         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1587
1588         sk->sk_prot->destroy(sk);
1589
1590         sk_stream_kill_queues(sk);
1591
1592         xfrm_sk_free_policy(sk);
1593
1594 #ifdef INET_REFCNT_DEBUG
1595         if (atomic_read(&sk->sk_refcnt) != 1) {
1596                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1597                        sk, atomic_read(&sk->sk_refcnt));
1598         }
1599 #endif
1600
1601         atomic_dec(&tcp_orphan_count);
1602         sock_put(sk);
1603 }
1604
1605 void tcp_close(struct sock *sk, long timeout)
1606 {
1607         struct sk_buff *skb;
1608         int data_was_unread = 0;
1609
1610         lock_sock(sk);
1611         sk->sk_shutdown = SHUTDOWN_MASK;
1612
1613         if (sk->sk_state == TCP_LISTEN) {
1614                 tcp_set_state(sk, TCP_CLOSE);
1615
1616                 /* Special case. */
1617                 tcp_listen_stop(sk);
1618
1619                 goto adjudge_to_death;
1620         }
1621
1622         /*  We need to flush the recv. buffs.  We do this only on the
1623          *  descriptor close, not protocol-sourced closes, because the
1624          *  reader process may not have drained the data yet!
1625          */
1626         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1627                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1628                           skb->h.th->fin;
1629                 data_was_unread += len;
1630                 __kfree_skb(skb);
1631         }
1632
1633         sk_stream_mem_reclaim(sk);
1634
1635         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1636          * 3.10, we send a RST here because data was lost.  To
1637          * witness the awful effects of the old behavior of always
1638          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1639          * a bulk GET in an FTP client, suspend the process, wait
1640          * for the client to advertise a zero window, then kill -9
1641          * the FTP client, wheee...  Note: timeout is always zero
1642          * in such a case.
1643          */
1644         if (data_was_unread) {
1645                 /* Unread data was tossed, zap the connection. */
1646                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1647                 tcp_set_state(sk, TCP_CLOSE);
1648                 tcp_send_active_reset(sk, GFP_KERNEL);
1649         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1650                 /* Check zero linger _after_ checking for unread data. */
1651                 sk->sk_prot->disconnect(sk, 0);
1652                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1653         } else if (tcp_close_state(sk)) {
1654                 /* We FIN if the application ate all the data before
1655                  * zapping the connection.
1656                  */
1657
1658                 /* RED-PEN. Formally speaking, we have broken TCP state
1659                  * machine. State transitions:
1660                  *
1661                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1662                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1663                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1664                  *
1665                  * are legal only when FIN has been sent (i.e. in window),
1666                  * rather than queued out of window. Purists blame.
1667                  *
1668                  * F.e. "RFC state" is ESTABLISHED,
1669                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1670                  *
1671                  * The visible declinations are that sometimes
1672                  * we enter time-wait state, when it is not required really
1673                  * (harmless), do not send active resets, when they are
1674                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1675                  * they look as CLOSING or LAST_ACK for Linux)
1676                  * Probably, I missed some more holelets.
1677                  *                                              --ANK
1678                  */
1679                 tcp_send_fin(sk);
1680         }
1681
1682         sk_stream_wait_close(sk, timeout);
1683
1684 adjudge_to_death:
1685         /* It is the last release_sock in its life. It will remove backlog. */
1686         release_sock(sk);
1687
1688
1689         /* Now socket is owned by kernel and we acquire BH lock
1690            to finish close. No need to check for user refs.
1691          */
1692         local_bh_disable();
1693         bh_lock_sock(sk);
1694         BUG_TRAP(!sock_owned_by_user(sk));
1695
1696         sock_hold(sk);
1697         sock_orphan(sk);
1698
1699         /*      This is a (useful) BSD violating of the RFC. There is a
1700          *      problem with TCP as specified in that the other end could
1701          *      keep a socket open forever with no application left this end.
1702          *      We use a 3 minute timeout (about the same as BSD) then kill
1703          *      our end. If they send after that then tough - BUT: long enough
1704          *      that we won't make the old 4*rto = almost no time - whoops
1705          *      reset mistake.
1706          *
1707          *      Nope, it was not mistake. It is really desired behaviour
1708          *      f.e. on http servers, when such sockets are useless, but
1709          *      consume significant resources. Let's do it with special
1710          *      linger2 option.                                 --ANK
1711          */
1712
1713         if (sk->sk_state == TCP_FIN_WAIT2) {
1714                 struct tcp_sock *tp = tcp_sk(sk);
1715                 if (tp->linger2 < 0) {
1716                         tcp_set_state(sk, TCP_CLOSE);
1717                         tcp_send_active_reset(sk, GFP_ATOMIC);
1718                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1719                 } else {
1720                         int tmo = tcp_fin_time(tp);
1721
1722                         if (tmo > TCP_TIMEWAIT_LEN) {
1723                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1724                         } else {
1725                                 atomic_inc(&tcp_orphan_count);
1726                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1727                                 goto out;
1728                         }
1729                 }
1730         }
1731         if (sk->sk_state != TCP_CLOSE) {
1732                 sk_stream_mem_reclaim(sk);
1733                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1734                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1735                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1736                         if (net_ratelimit())
1737                                 printk(KERN_INFO "TCP: too many of orphaned "
1738                                        "sockets\n");
1739                         tcp_set_state(sk, TCP_CLOSE);
1740                         tcp_send_active_reset(sk, GFP_ATOMIC);
1741                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1742                 }
1743         }
1744         atomic_inc(&tcp_orphan_count);
1745
1746         if (sk->sk_state == TCP_CLOSE)
1747                 tcp_destroy_sock(sk);
1748         /* Otherwise, socket is reprieved until protocol close. */
1749
1750 out:
1751         bh_unlock_sock(sk);
1752         local_bh_enable();
1753         sock_put(sk);
1754 }
1755
1756 /* These states need RST on ABORT according to RFC793 */
1757
1758 static inline int tcp_need_reset(int state)
1759 {
1760         return (1 << state) &
1761                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1762                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1763 }
1764
1765 int tcp_disconnect(struct sock *sk, int flags)
1766 {
1767         struct inet_sock *inet = inet_sk(sk);
1768         struct tcp_sock *tp = tcp_sk(sk);
1769         int err = 0;
1770         int old_state = sk->sk_state;
1771
1772         if (old_state != TCP_CLOSE)
1773                 tcp_set_state(sk, TCP_CLOSE);
1774
1775         /* ABORT function of RFC793 */
1776         if (old_state == TCP_LISTEN) {
1777                 tcp_listen_stop(sk);
1778         } else if (tcp_need_reset(old_state) ||
1779                    (tp->snd_nxt != tp->write_seq &&
1780                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1781                 /* The last check adjusts for discrepance of Linux wrt. RFC
1782                  * states
1783                  */
1784                 tcp_send_active_reset(sk, gfp_any());
1785                 sk->sk_err = ECONNRESET;
1786         } else if (old_state == TCP_SYN_SENT)
1787                 sk->sk_err = ECONNRESET;
1788
1789         tcp_clear_xmit_timers(sk);
1790         __skb_queue_purge(&sk->sk_receive_queue);
1791         sk_stream_writequeue_purge(sk);
1792         __skb_queue_purge(&tp->out_of_order_queue);
1793
1794         inet->dport = 0;
1795
1796         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1797                 inet_reset_saddr(sk);
1798
1799         sk->sk_shutdown = 0;
1800         sock_reset_flag(sk, SOCK_DONE);
1801         tp->srtt = 0;
1802         if ((tp->write_seq += tp->max_window + 2) == 0)
1803                 tp->write_seq = 1;
1804         tp->backoff = 0;
1805         tp->snd_cwnd = 2;
1806         tp->probes_out = 0;
1807         tp->packets_out = 0;
1808         tp->snd_ssthresh = 0x7fffffff;
1809         tp->snd_cwnd_cnt = 0;
1810         tcp_set_ca_state(tp, TCP_CA_Open);
1811         tcp_clear_retrans(tp);
1812         tcp_delack_init(tp);
1813         sk->sk_send_head = NULL;
1814         tp->rx_opt.saw_tstamp = 0;
1815         tcp_sack_reset(&tp->rx_opt);
1816         __sk_dst_reset(sk);
1817
1818         BUG_TRAP(!inet->num || tp->bind_hash);
1819
1820         sk->sk_error_report(sk);
1821         return err;
1822 }
1823
1824 /*
1825  *      Wait for an incoming connection, avoid race
1826  *      conditions. This must be called with the socket locked.
1827  */
1828 static int wait_for_connect(struct sock *sk, long timeo)
1829 {
1830         struct tcp_sock *tp = tcp_sk(sk);
1831         DEFINE_WAIT(wait);
1832         int err;
1833
1834         /*
1835          * True wake-one mechanism for incoming connections: only
1836          * one process gets woken up, not the 'whole herd'.
1837          * Since we do not 'race & poll' for established sockets
1838          * anymore, the common case will execute the loop only once.
1839          *
1840          * Subtle issue: "add_wait_queue_exclusive()" will be added
1841          * after any current non-exclusive waiters, and we know that
1842          * it will always _stay_ after any new non-exclusive waiters
1843          * because all non-exclusive waiters are added at the
1844          * beginning of the wait-queue. As such, it's ok to "drop"
1845          * our exclusiveness temporarily when we get woken up without
1846          * having to remove and re-insert us on the wait queue.
1847          */
1848         for (;;) {
1849                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1850                                           TASK_INTERRUPTIBLE);
1851                 release_sock(sk);
1852                 if (reqsk_queue_empty(&tp->accept_queue))
1853                         timeo = schedule_timeout(timeo);
1854                 lock_sock(sk);
1855                 err = 0;
1856                 if (!reqsk_queue_empty(&tp->accept_queue))
1857                         break;
1858                 err = -EINVAL;
1859                 if (sk->sk_state != TCP_LISTEN)
1860                         break;
1861                 err = sock_intr_errno(timeo);
1862                 if (signal_pending(current))
1863                         break;
1864                 err = -EAGAIN;
1865                 if (!timeo)
1866                         break;
1867         }
1868         finish_wait(sk->sk_sleep, &wait);
1869         return err;
1870 }
1871
1872 /*
1873  *      This will accept the next outstanding connection.
1874  */
1875
1876 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1877 {
1878         struct tcp_sock *tp = tcp_sk(sk);
1879         struct sock *newsk;
1880         int error;
1881
1882         lock_sock(sk);
1883
1884         /* We need to make sure that this socket is listening,
1885          * and that it has something pending.
1886          */
1887         error = -EINVAL;
1888         if (sk->sk_state != TCP_LISTEN)
1889                 goto out_err;
1890
1891         /* Find already established connection */
1892         if (reqsk_queue_empty(&tp->accept_queue)) {
1893                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1894
1895                 /* If this is a non blocking socket don't sleep */
1896                 error = -EAGAIN;
1897                 if (!timeo)
1898                         goto out_err;
1899
1900                 error = wait_for_connect(sk, timeo);
1901                 if (error)
1902                         goto out_err;
1903         }
1904
1905         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1906         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1907 out:
1908         release_sock(sk);
1909         return newsk;
1910 out_err:
1911         newsk = NULL;
1912         *err = error;
1913         goto out;
1914 }
1915
1916 /*
1917  *      Socket option code for TCP.
1918  */
1919 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1920                    int optlen)
1921 {
1922         struct tcp_sock *tp = tcp_sk(sk);
1923         int val;
1924         int err = 0;
1925
1926         if (level != SOL_TCP)
1927                 return tp->af_specific->setsockopt(sk, level, optname,
1928                                                    optval, optlen);
1929
1930         /* This is a string value all the others are int's */
1931         if (optname == TCP_CONGESTION) {
1932                 char name[TCP_CA_NAME_MAX];
1933
1934                 if (optlen < 1)
1935                         return -EINVAL;
1936
1937                 val = strncpy_from_user(name, optval,
1938                                         min(TCP_CA_NAME_MAX-1, optlen));
1939                 if (val < 0)
1940                         return -EFAULT;
1941                 name[val] = 0;
1942
1943                 lock_sock(sk);
1944                 err = tcp_set_congestion_control(tp, name);
1945                 release_sock(sk);
1946                 return err;
1947         }
1948
1949         if (optlen < sizeof(int))
1950                 return -EINVAL;
1951
1952         if (get_user(val, (int __user *)optval))
1953                 return -EFAULT;
1954
1955         lock_sock(sk);
1956
1957         switch (optname) {
1958         case TCP_MAXSEG:
1959                 /* Values greater than interface MTU won't take effect. However
1960                  * at the point when this call is done we typically don't yet
1961                  * know which interface is going to be used */
1962                 if (val < 8 || val > MAX_TCP_WINDOW) {
1963                         err = -EINVAL;
1964                         break;
1965                 }
1966                 tp->rx_opt.user_mss = val;
1967                 break;
1968
1969         case TCP_NODELAY:
1970                 if (val) {
1971                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1972                          * this option on corked socket is remembered, but
1973                          * it is not activated until cork is cleared.
1974                          *
1975                          * However, when TCP_NODELAY is set we make
1976                          * an explicit push, which overrides even TCP_CORK
1977                          * for currently queued segments.
1978                          */
1979                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1980                         tcp_push_pending_frames(sk, tp);
1981                 } else {
1982                         tp->nonagle &= ~TCP_NAGLE_OFF;
1983                 }
1984                 break;
1985
1986         case TCP_CORK:
1987                 /* When set indicates to always queue non-full frames.
1988                  * Later the user clears this option and we transmit
1989                  * any pending partial frames in the queue.  This is
1990                  * meant to be used alongside sendfile() to get properly
1991                  * filled frames when the user (for example) must write
1992                  * out headers with a write() call first and then use
1993                  * sendfile to send out the data parts.
1994                  *
1995                  * TCP_CORK can be set together with TCP_NODELAY and it is
1996                  * stronger than TCP_NODELAY.
1997                  */
1998                 if (val) {
1999                         tp->nonagle |= TCP_NAGLE_CORK;
2000                 } else {
2001                         tp->nonagle &= ~TCP_NAGLE_CORK;
2002                         if (tp->nonagle&TCP_NAGLE_OFF)
2003                                 tp->nonagle |= TCP_NAGLE_PUSH;
2004                         tcp_push_pending_frames(sk, tp);
2005                 }
2006                 break;
2007
2008         case TCP_KEEPIDLE:
2009                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2010                         err = -EINVAL;
2011                 else {
2012                         tp->keepalive_time = val * HZ;
2013                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2014                             !((1 << sk->sk_state) &
2015                               (TCPF_CLOSE | TCPF_LISTEN))) {
2016                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2017                                 if (tp->keepalive_time > elapsed)
2018                                         elapsed = tp->keepalive_time - elapsed;
2019                                 else
2020                                         elapsed = 0;
2021                                 tcp_reset_keepalive_timer(sk, elapsed);
2022                         }
2023                 }
2024                 break;
2025         case TCP_KEEPINTVL:
2026                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2027                         err = -EINVAL;
2028                 else
2029                         tp->keepalive_intvl = val * HZ;
2030                 break;
2031         case TCP_KEEPCNT:
2032                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2033                         err = -EINVAL;
2034                 else
2035                         tp->keepalive_probes = val;
2036                 break;
2037         case TCP_SYNCNT:
2038                 if (val < 1 || val > MAX_TCP_SYNCNT)
2039                         err = -EINVAL;
2040                 else
2041                         tp->syn_retries = val;
2042                 break;
2043
2044         case TCP_LINGER2:
2045                 if (val < 0)
2046                         tp->linger2 = -1;
2047                 else if (val > sysctl_tcp_fin_timeout / HZ)
2048                         tp->linger2 = 0;
2049                 else
2050                         tp->linger2 = val * HZ;
2051                 break;
2052
2053         case TCP_DEFER_ACCEPT:
2054                 tp->defer_accept = 0;
2055                 if (val > 0) {
2056                         /* Translate value in seconds to number of
2057                          * retransmits */
2058                         while (tp->defer_accept < 32 &&
2059                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2060                                        tp->defer_accept))
2061                                 tp->defer_accept++;
2062                         tp->defer_accept++;
2063                 }
2064                 break;
2065
2066         case TCP_WINDOW_CLAMP:
2067                 if (!val) {
2068                         if (sk->sk_state != TCP_CLOSE) {
2069                                 err = -EINVAL;
2070                                 break;
2071                         }
2072                         tp->window_clamp = 0;
2073                 } else
2074                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2075                                                 SOCK_MIN_RCVBUF / 2 : val;
2076                 break;
2077
2078         case TCP_QUICKACK:
2079                 if (!val) {
2080                         tp->ack.pingpong = 1;
2081                 } else {
2082                         tp->ack.pingpong = 0;
2083                         if ((1 << sk->sk_state) &
2084                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2085                             tcp_ack_scheduled(tp)) {
2086                                 tp->ack.pending |= TCP_ACK_PUSHED;
2087                                 cleanup_rbuf(sk, 1);
2088                                 if (!(val & 1))
2089                                         tp->ack.pingpong = 1;
2090                         }
2091                 }
2092                 break;
2093
2094         default:
2095                 err = -ENOPROTOOPT;
2096                 break;
2097         };
2098         release_sock(sk);
2099         return err;
2100 }
2101
2102 /* Return information about state of tcp endpoint in API format. */
2103 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2104 {
2105         struct tcp_sock *tp = tcp_sk(sk);
2106         u32 now = tcp_time_stamp;
2107
2108         memset(info, 0, sizeof(*info));
2109
2110         info->tcpi_state = sk->sk_state;
2111         info->tcpi_ca_state = tp->ca_state;
2112         info->tcpi_retransmits = tp->retransmits;
2113         info->tcpi_probes = tp->probes_out;
2114         info->tcpi_backoff = tp->backoff;
2115
2116         if (tp->rx_opt.tstamp_ok)
2117                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2118         if (tp->rx_opt.sack_ok)
2119                 info->tcpi_options |= TCPI_OPT_SACK;
2120         if (tp->rx_opt.wscale_ok) {
2121                 info->tcpi_options |= TCPI_OPT_WSCALE;
2122                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2123                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2124         } 
2125
2126         if (tp->ecn_flags&TCP_ECN_OK)
2127                 info->tcpi_options |= TCPI_OPT_ECN;
2128
2129         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131         info->tcpi_snd_mss = tp->mss_cache_std;
2132         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133
2134         info->tcpi_unacked = tp->packets_out;
2135         info->tcpi_sacked = tp->sacked_out;
2136         info->tcpi_lost = tp->lost_out;
2137         info->tcpi_retrans = tp->retrans_out;
2138         info->tcpi_fackets = tp->fackets_out;
2139
2140         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2141         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2142         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2143
2144         info->tcpi_pmtu = tp->pmtu_cookie;
2145         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2146         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2147         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2148         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2149         info->tcpi_snd_cwnd = tp->snd_cwnd;
2150         info->tcpi_advmss = tp->advmss;
2151         info->tcpi_reordering = tp->reordering;
2152
2153         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2154         info->tcpi_rcv_space = tp->rcvq_space.space;
2155
2156         info->tcpi_total_retrans = tp->total_retrans;
2157 }
2158
2159 EXPORT_SYMBOL_GPL(tcp_get_info);
2160
2161 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2162                    int __user *optlen)
2163 {
2164         struct tcp_sock *tp = tcp_sk(sk);
2165         int val, len;
2166
2167         if (level != SOL_TCP)
2168                 return tp->af_specific->getsockopt(sk, level, optname,
2169                                                    optval, optlen);
2170
2171         if (get_user(len, optlen))
2172                 return -EFAULT;
2173
2174         len = min_t(unsigned int, len, sizeof(int));
2175
2176         if (len < 0)
2177                 return -EINVAL;
2178
2179         switch (optname) {
2180         case TCP_MAXSEG:
2181                 val = tp->mss_cache_std;
2182                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183                         val = tp->rx_opt.user_mss;
2184                 break;
2185         case TCP_NODELAY:
2186                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2187                 break;
2188         case TCP_CORK:
2189                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2190                 break;
2191         case TCP_KEEPIDLE:
2192                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2193                 break;
2194         case TCP_KEEPINTVL:
2195                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2196                 break;
2197         case TCP_KEEPCNT:
2198                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2199                 break;
2200         case TCP_SYNCNT:
2201                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2202                 break;
2203         case TCP_LINGER2:
2204                 val = tp->linger2;
2205                 if (val >= 0)
2206                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2207                 break;
2208         case TCP_DEFER_ACCEPT:
2209                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2210                                                (tp->defer_accept - 1));
2211                 break;
2212         case TCP_WINDOW_CLAMP:
2213                 val = tp->window_clamp;
2214                 break;
2215         case TCP_INFO: {
2216                 struct tcp_info info;
2217
2218                 if (get_user(len, optlen))
2219                         return -EFAULT;
2220
2221                 tcp_get_info(sk, &info);
2222
2223                 len = min_t(unsigned int, len, sizeof(info));
2224                 if (put_user(len, optlen))
2225                         return -EFAULT;
2226                 if (copy_to_user(optval, &info, len))
2227                         return -EFAULT;
2228                 return 0;
2229         }
2230         case TCP_QUICKACK:
2231                 val = !tp->ack.pingpong;
2232                 break;
2233
2234         case TCP_CONGESTION:
2235                 if (get_user(len, optlen))
2236                         return -EFAULT;
2237                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2238                 if (put_user(len, optlen))
2239                         return -EFAULT;
2240                 if (copy_to_user(optval, tp->ca_ops->name, len))
2241                         return -EFAULT;
2242                 return 0;
2243         default:
2244                 return -ENOPROTOOPT;
2245         };
2246
2247         if (put_user(len, optlen))
2248                 return -EFAULT;
2249         if (copy_to_user(optval, &val, len))
2250                 return -EFAULT;
2251         return 0;
2252 }
2253
2254
2255 extern void __skb_cb_too_small_for_tcp(int, int);
2256 extern struct tcp_congestion_ops tcp_reno;
2257
2258 static __initdata unsigned long thash_entries;
2259 static int __init set_thash_entries(char *str)
2260 {
2261         if (!str)
2262                 return 0;
2263         thash_entries = simple_strtoul(str, &str, 0);
2264         return 1;
2265 }
2266 __setup("thash_entries=", set_thash_entries);
2267
2268 void __init tcp_init(void)
2269 {
2270         struct sk_buff *skb = NULL;
2271         int order, i;
2272
2273         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2274                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2275                                            sizeof(skb->cb));
2276
2277         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2278                                               sizeof(struct tcp_bind_bucket),
2279                                               0, SLAB_HWCACHE_ALIGN,
2280                                               NULL, NULL);
2281         if (!tcp_bucket_cachep)
2282                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2283
2284         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2285                                                 sizeof(struct tcp_tw_bucket),
2286                                                 0, SLAB_HWCACHE_ALIGN,
2287                                                 NULL, NULL);
2288         if (!tcp_timewait_cachep)
2289                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2290
2291         /* Size and allocate the main established and bind bucket
2292          * hash tables.
2293          *
2294          * The methodology is similar to that of the buffer cache.
2295          */
2296         tcp_ehash = (struct tcp_ehash_bucket *)
2297                 alloc_large_system_hash("TCP established",
2298                                         sizeof(struct tcp_ehash_bucket),
2299                                         thash_entries,
2300                                         (num_physpages >= 128 * 1024) ?
2301                                                 (25 - PAGE_SHIFT) :
2302                                                 (27 - PAGE_SHIFT),
2303                                         HASH_HIGHMEM,
2304                                         &tcp_ehash_size,
2305                                         NULL,
2306                                         0);
2307         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2308         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2309                 rwlock_init(&tcp_ehash[i].lock);
2310                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2311         }
2312
2313         tcp_bhash = (struct tcp_bind_hashbucket *)
2314                 alloc_large_system_hash("TCP bind",
2315                                         sizeof(struct tcp_bind_hashbucket),
2316                                         tcp_ehash_size,
2317                                         (num_physpages >= 128 * 1024) ?
2318                                                 (25 - PAGE_SHIFT) :
2319                                                 (27 - PAGE_SHIFT),
2320                                         HASH_HIGHMEM,
2321                                         &tcp_bhash_size,
2322                                         NULL,
2323                                         64 * 1024);
2324         tcp_bhash_size = 1 << tcp_bhash_size;
2325         for (i = 0; i < tcp_bhash_size; i++) {
2326                 spin_lock_init(&tcp_bhash[i].lock);
2327                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2328         }
2329
2330         /* Try to be a bit smarter and adjust defaults depending
2331          * on available memory.
2332          */
2333         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2334                         (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
2335                         order++)
2336                 ;
2337         if (order >= 4) {
2338                 sysctl_local_port_range[0] = 32768;
2339                 sysctl_local_port_range[1] = 61000;
2340                 sysctl_tcp_max_tw_buckets = 180000;
2341                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2342                 sysctl_max_syn_backlog = 1024;
2343         } else if (order < 3) {
2344                 sysctl_local_port_range[0] = 1024 * (3 - order);
2345                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2346                 sysctl_tcp_max_orphans >>= (3 - order);
2347                 sysctl_max_syn_backlog = 128;
2348         }
2349         tcp_port_rover = sysctl_local_port_range[0] - 1;
2350
2351         sysctl_tcp_mem[0] =  768 << order;
2352         sysctl_tcp_mem[1] = 1024 << order;
2353         sysctl_tcp_mem[2] = 1536 << order;
2354
2355         if (order < 3) {
2356                 sysctl_tcp_wmem[2] = 64 * 1024;
2357                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2358                 sysctl_tcp_rmem[1] = 43689;
2359                 sysctl_tcp_rmem[2] = 2 * 43689;
2360         }
2361
2362         printk(KERN_INFO "TCP: Hash tables configured "
2363                "(established %d bind %d)\n",
2364                tcp_ehash_size << 1, tcp_bhash_size);
2365
2366         tcp_register_congestion_control(&tcp_reno);
2367 }
2368
2369 EXPORT_SYMBOL(tcp_accept);
2370 EXPORT_SYMBOL(tcp_close);
2371 EXPORT_SYMBOL(tcp_destroy_sock);
2372 EXPORT_SYMBOL(tcp_disconnect);
2373 EXPORT_SYMBOL(tcp_getsockopt);
2374 EXPORT_SYMBOL(tcp_ioctl);
2375 EXPORT_SYMBOL(tcp_poll);
2376 EXPORT_SYMBOL(tcp_read_sock);
2377 EXPORT_SYMBOL(tcp_recvmsg);
2378 EXPORT_SYMBOL(tcp_sendmsg);
2379 EXPORT_SYMBOL(tcp_sendpage);
2380 EXPORT_SYMBOL(tcp_setsockopt);
2381 EXPORT_SYMBOL(tcp_shutdown);
2382 EXPORT_SYMBOL(tcp_statistics);
2383 EXPORT_SYMBOL(tcp_timewait_cachep);