[INET]: Just rename the TCP hashtable functions/structs to inet_
[linux-2.6] / net / ipv4 / tcp.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *              Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *              Alan Cox        :       Numerous verify_area() calls
24  *              Alan Cox        :       Set the ACK bit on a reset
25  *              Alan Cox        :       Stopped it crashing if it closed while
26  *                                      sk->inuse=1 and was trying to connect
27  *                                      (tcp_err()).
28  *              Alan Cox        :       All icmp error handling was broken
29  *                                      pointers passed where wrong and the
30  *                                      socket was looked up backwards. Nobody
31  *                                      tested any icmp error code obviously.
32  *              Alan Cox        :       tcp_err() now handled properly. It
33  *                                      wakes people on errors. poll
34  *                                      behaves and the icmp error race
35  *                                      has gone by moving it into sock.c
36  *              Alan Cox        :       tcp_send_reset() fixed to work for
37  *                                      everything not just packets for
38  *                                      unknown sockets.
39  *              Alan Cox        :       tcp option processing.
40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
41  *                                      syn rule wrong]
42  *              Herp Rosmanith  :       More reset fixes
43  *              Alan Cox        :       No longer acks invalid rst frames.
44  *                                      Acking any kind of RST is right out.
45  *              Alan Cox        :       Sets an ignore me flag on an rst
46  *                                      receive otherwise odd bits of prattle
47  *                                      escape still
48  *              Alan Cox        :       Fixed another acking RST frame bug.
49  *                                      Should stop LAN workplace lockups.
50  *              Alan Cox        :       Some tidyups using the new skb list
51  *                                      facilities
52  *              Alan Cox        :       sk->keepopen now seems to work
53  *              Alan Cox        :       Pulls options out correctly on accepts
54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
56  *                                      bit to skb ops.
57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
58  *                                      nasty.
59  *              Alan Cox        :       Added some better commenting, as the
60  *                                      tcp is hard to follow
61  *              Alan Cox        :       Removed incorrect check for 20 * psh
62  *      Michael O'Reilly        :       ack < copied bug fix.
63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
64  *              Alan Cox        :       FIN with no memory -> CRASH
65  *              Alan Cox        :       Added socket option proto entries.
66  *                                      Also added awareness of them to accept.
67  *              Alan Cox        :       Added TCP options (SOL_TCP)
68  *              Alan Cox        :       Switched wakeup calls to callbacks,
69  *                                      so the kernel can layer network
70  *                                      sockets.
71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
73  *              Alan Cox        :       RST frames sent on unsynchronised
74  *                                      state ack error.
75  *              Alan Cox        :       Put in missing check for SYN bit.
76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
77  *                                      window non shrink trick.
78  *              Alan Cox        :       Added a couple of small NET2E timer
79  *                                      fixes
80  *              Charles Hedrick :       TCP fixes
81  *              Toomas Tamm     :       TCP window fixes
82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
83  *              Charles Hedrick :       Rewrote most of it to actually work
84  *              Linus           :       Rewrote tcp_read() and URG handling
85  *                                      completely
86  *              Gerhard Koerting:       Fixed some missing timer handling
87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
88  *              Gerhard Koerting:       PC/TCP workarounds
89  *              Adam Caldwell   :       Assorted timer/timing errors
90  *              Matthew Dillon  :       Fixed another RST bug
91  *              Alan Cox        :       Move to kernel side addressing changes.
92  *              Alan Cox        :       Beginning work on TCP fastpathing
93  *                                      (not yet usable)
94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
95  *              Alan Cox        :       TCP fast path debugging
96  *              Alan Cox        :       Window clamping
97  *              Michael Riepe   :       Bug in tcp_check()
98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
99  *              Matt Dillon     :       Yet more small nasties remove from the
100  *                                      TCP code (Be very nice to this man if
101  *                                      tcp finally works 100%) 8)
102  *              Alan Cox        :       BSD accept semantics.
103  *              Alan Cox        :       Reset on closedown bug.
104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
105  *              Michael Pall    :       Handle poll() after URG properly in
106  *                                      all cases.
107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
108  *                                      (multi URG PUSH broke rlogin).
109  *              Michael Pall    :       Fix the multi URG PUSH problem in
110  *                                      tcp_readable(), poll() after URG
111  *                                      works now.
112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
113  *                                      BSD api.
114  *              Alan Cox        :       Changed the semantics of sk->socket to
115  *                                      fix a race and a signal problem with
116  *                                      accept() and async I/O.
117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
120  *                                      clients/servers which listen in on
121  *                                      fixed ports.
122  *              Alan Cox        :       Cleaned the above up and shrank it to
123  *                                      a sensible code size.
124  *              Alan Cox        :       Self connect lockup fix.
125  *              Alan Cox        :       No connect to multicast.
126  *              Ross Biro       :       Close unaccepted children on master
127  *                                      socket close.
128  *              Alan Cox        :       Reset tracing code.
129  *              Alan Cox        :       Spurious resets on shutdown.
130  *              Alan Cox        :       Giant 15 minute/60 second timer error
131  *              Alan Cox        :       Small whoops in polling before an
132  *                                      accept.
133  *              Alan Cox        :       Kept the state trace facility since
134  *                                      it's handy for debugging.
135  *              Alan Cox        :       More reset handler fixes.
136  *              Alan Cox        :       Started rewriting the code based on
137  *                                      the RFC's for other useful protocol
138  *                                      references see: Comer, KA9Q NOS, and
139  *                                      for a reference on the difference
140  *                                      between specifications and how BSD
141  *                                      works see the 4.4lite source.
142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
143  *                                      close.
144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
146  *              Alan Cox        :       Reimplemented timers as per the RFC
147  *                                      and using multiple timers for sanity.
148  *              Alan Cox        :       Small bug fixes, and a lot of new
149  *                                      comments.
150  *              Alan Cox        :       Fixed dual reader crash by locking
151  *                                      the buffers (much like datagram.c)
152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
153  *                                      now gets fed up of retrying without
154  *                                      (even a no space) answer.
155  *              Alan Cox        :       Extracted closing code better
156  *              Alan Cox        :       Fixed the closing state machine to
157  *                                      resemble the RFC.
158  *              Alan Cox        :       More 'per spec' fixes.
159  *              Jorge Cwik      :       Even faster checksumming.
160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
161  *                                      only frames. At least one pc tcp stack
162  *                                      generates them.
163  *              Alan Cox        :       Cache last socket.
164  *              Alan Cox        :       Per route irtt.
165  *              Matt Day        :       poll()->select() match BSD precisely on error
166  *              Alan Cox        :       New buffers
167  *              Marc Tamsky     :       Various sk->prot->retransmits and
168  *                                      sk->retransmits misupdating fixed.
169  *                                      Fixed tcp_write_timeout: stuck close,
170  *                                      and TCP syn retries gets used now.
171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
172  *                                      ack if state is TCP_CLOSED.
173  *              Alan Cox        :       Look up device on a retransmit - routes may
174  *                                      change. Doesn't yet cope with MSS shrink right
175  *                                      but it's a start!
176  *              Marc Tamsky     :       Closing in closing fixes.
177  *              Mike Shaver     :       RFC1122 verifications.
178  *              Alan Cox        :       rcv_saddr errors.
179  *              Alan Cox        :       Block double connect().
180  *              Alan Cox        :       Small hooks for enSKIP.
181  *              Alexey Kuznetsov:       Path MTU discovery.
182  *              Alan Cox        :       Support soft errors.
183  *              Alan Cox        :       Fix MTU discovery pathological case
184  *                                      when the remote claims no mtu!
185  *              Marc Tamsky     :       TCP_CLOSE fix.
186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
187  *                                      window but wrong (fixes NT lpd problems)
188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
189  *              Joerg Reuter    :       No modification of locked buffers in
190  *                                      tcp_do_retransmit()
191  *              Eric Schenk     :       Changed receiver side silly window
192  *                                      avoidance algorithm to BSD style
193  *                                      algorithm. This doubles throughput
194  *                                      against machines running Solaris,
195  *                                      and seems to result in general
196  *                                      improvement.
197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
198  *      Willy Konynenberg       :       Transparent proxying support.
199  *      Mike McLagan            :       Routing by source
200  *              Keith Owens     :       Do proper merging with partial SKB's in
201  *                                      tcp_do_sendmsg to avoid burstiness.
202  *              Eric Schenk     :       Fix fast close down bug with
203  *                                      shutdown() followed by close().
204  *              Andi Kleen      :       Make poll agree with SIGIO
205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
206  *                                      lingertime == 0 (RFC 793 ABORT Call)
207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
208  *                                      csum_and_copy_from_user() if possible.
209  *
210  *              This program is free software; you can redistribute it and/or
211  *              modify it under the terms of the GNU General Public License
212  *              as published by the Free Software Foundation; either version
213  *              2 of the License, or(at your option) any later version.
214  *
215  * Description of States:
216  *
217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
218  *
219  *      TCP_SYN_RECV            received a connection request, sent ack,
220  *                              waiting for final ack in three-way handshake.
221  *
222  *      TCP_ESTABLISHED         connection established
223  *
224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
225  *                              transmission of remaining buffered data
226  *
227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
228  *                              to shutdown
229  *
230  *      TCP_CLOSING             both sides have shutdown but we still have
231  *                              data we have to finish sending
232  *
233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
234  *                              closed, can only be entered from FIN_WAIT2
235  *                              or CLOSING.  Required because the other end
236  *                              may not have gotten our last ACK causing it
237  *                              to retransmit the data packet (which we ignore)
238  *
239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
240  *                              us to finish writing our data and to shutdown
241  *                              (we have to close() to move on to LAST_ACK)
242  *
243  *      TCP_LAST_ACK            our side has shutdown after remote has
244  *                              shutdown.  There may still be data in our
245  *                              buffer that we have to finish sending
246  *
247  *      TCP_CLOSE               socket is finished
248  */
249
250 #include <linux/config.h>
251 #include <linux/module.h>
252 #include <linux/types.h>
253 #include <linux/fcntl.h>
254 #include <linux/poll.h>
255 #include <linux/init.h>
256 #include <linux/smp_lock.h>
257 #include <linux/fs.h>
258 #include <linux/random.h>
259 #include <linux/bootmem.h>
260
261 #include <net/icmp.h>
262 #include <net/tcp.h>
263 #include <net/xfrm.h>
264 #include <net/ip.h>
265
266
267 #include <asm/uaccess.h>
268 #include <asm/ioctls.h>
269
270 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
271
272 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
273
274 kmem_cache_t *tcp_bucket_cachep;
275
276 EXPORT_SYMBOL_GPL(tcp_bucket_cachep);
277
278 kmem_cache_t *tcp_timewait_cachep;
279
280 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
281
282 int sysctl_tcp_mem[3];
283 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
284 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
285
286 EXPORT_SYMBOL(sysctl_tcp_mem);
287 EXPORT_SYMBOL(sysctl_tcp_rmem);
288 EXPORT_SYMBOL(sysctl_tcp_wmem);
289
290 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
291 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
292
293 EXPORT_SYMBOL(tcp_memory_allocated);
294 EXPORT_SYMBOL(tcp_sockets_allocated);
295
296 /*
297  * Pressure flag: try to collapse.
298  * Technical note: it is used by multiple contexts non atomically.
299  * All the sk_stream_mem_schedule() is of this nature: accounting
300  * is strict, actions are advisory and have some latency.
301  */
302 int tcp_memory_pressure;
303
304 EXPORT_SYMBOL(tcp_memory_pressure);
305
306 void tcp_enter_memory_pressure(void)
307 {
308         if (!tcp_memory_pressure) {
309                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
310                 tcp_memory_pressure = 1;
311         }
312 }
313
314 EXPORT_SYMBOL(tcp_enter_memory_pressure);
315
316 /*
317  * LISTEN is a special case for poll..
318  */
319 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
320                                                poll_table *wait)
321 {
322         return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
323 }
324
325 /*
326  *      Wait for a TCP event.
327  *
328  *      Note that we don't need to lock the socket, as the upper poll layers
329  *      take care of normal races (between the test and the event) and we don't
330  *      go look at any of the socket buffers directly.
331  */
332 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
333 {
334         unsigned int mask;
335         struct sock *sk = sock->sk;
336         struct tcp_sock *tp = tcp_sk(sk);
337
338         poll_wait(file, sk->sk_sleep, wait);
339         if (sk->sk_state == TCP_LISTEN)
340                 return tcp_listen_poll(sk, wait);
341
342         /* Socket is not locked. We are protected from async events
343            by poll logic and correct handling of state changes
344            made by another threads is impossible in any case.
345          */
346
347         mask = 0;
348         if (sk->sk_err)
349                 mask = POLLERR;
350
351         /*
352          * POLLHUP is certainly not done right. But poll() doesn't
353          * have a notion of HUP in just one direction, and for a
354          * socket the read side is more interesting.
355          *
356          * Some poll() documentation says that POLLHUP is incompatible
357          * with the POLLOUT/POLLWR flags, so somebody should check this
358          * all. But careful, it tends to be safer to return too many
359          * bits than too few, and you can easily break real applications
360          * if you don't tell them that something has hung up!
361          *
362          * Check-me.
363          *
364          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
365          * our fs/select.c). It means that after we received EOF,
366          * poll always returns immediately, making impossible poll() on write()
367          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
368          * if and only if shutdown has been made in both directions.
369          * Actually, it is interesting to look how Solaris and DUX
370          * solve this dilemma. I would prefer, if PULLHUP were maskable,
371          * then we could set it on SND_SHUTDOWN. BTW examples given
372          * in Stevens' books assume exactly this behaviour, it explains
373          * why PULLHUP is incompatible with POLLOUT.    --ANK
374          *
375          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
376          * blocking on fresh not-connected or disconnected socket. --ANK
377          */
378         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
379                 mask |= POLLHUP;
380         if (sk->sk_shutdown & RCV_SHUTDOWN)
381                 mask |= POLLIN | POLLRDNORM;
382
383         /* Connected? */
384         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
385                 /* Potential race condition. If read of tp below will
386                  * escape above sk->sk_state, we can be illegally awaken
387                  * in SYN_* states. */
388                 if ((tp->rcv_nxt != tp->copied_seq) &&
389                     (tp->urg_seq != tp->copied_seq ||
390                      tp->rcv_nxt != tp->copied_seq + 1 ||
391                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
392                         mask |= POLLIN | POLLRDNORM;
393
394                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
395                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
396                                 mask |= POLLOUT | POLLWRNORM;
397                         } else {  /* send SIGIO later */
398                                 set_bit(SOCK_ASYNC_NOSPACE,
399                                         &sk->sk_socket->flags);
400                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
401
402                                 /* Race breaker. If space is freed after
403                                  * wspace test but before the flags are set,
404                                  * IO signal will be lost.
405                                  */
406                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
407                                         mask |= POLLOUT | POLLWRNORM;
408                         }
409                 }
410
411                 if (tp->urg_data & TCP_URG_VALID)
412                         mask |= POLLPRI;
413         }
414         return mask;
415 }
416
417 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
418 {
419         struct tcp_sock *tp = tcp_sk(sk);
420         int answ;
421
422         switch (cmd) {
423         case SIOCINQ:
424                 if (sk->sk_state == TCP_LISTEN)
425                         return -EINVAL;
426
427                 lock_sock(sk);
428                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
429                         answ = 0;
430                 else if (sock_flag(sk, SOCK_URGINLINE) ||
431                          !tp->urg_data ||
432                          before(tp->urg_seq, tp->copied_seq) ||
433                          !before(tp->urg_seq, tp->rcv_nxt)) {
434                         answ = tp->rcv_nxt - tp->copied_seq;
435
436                         /* Subtract 1, if FIN is in queue. */
437                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
438                                 answ -=
439                        ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
440                 } else
441                         answ = tp->urg_seq - tp->copied_seq;
442                 release_sock(sk);
443                 break;
444         case SIOCATMARK:
445                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
446                 break;
447         case SIOCOUTQ:
448                 if (sk->sk_state == TCP_LISTEN)
449                         return -EINVAL;
450
451                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
452                         answ = 0;
453                 else
454                         answ = tp->write_seq - tp->snd_una;
455                 break;
456         default:
457                 return -ENOIOCTLCMD;
458         };
459
460         return put_user(answ, (int __user *)arg);
461 }
462
463
464 int tcp_listen_start(struct sock *sk)
465 {
466         struct inet_sock *inet = inet_sk(sk);
467         struct tcp_sock *tp = tcp_sk(sk);
468         int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
469
470         if (rc != 0)
471                 return rc;
472
473         sk->sk_max_ack_backlog = 0;
474         sk->sk_ack_backlog = 0;
475         tcp_delack_init(tp);
476
477         /* There is race window here: we announce ourselves listening,
478          * but this transition is still not validated by get_port().
479          * It is OK, because this socket enters to hash table only
480          * after validation is complete.
481          */
482         sk->sk_state = TCP_LISTEN;
483         if (!sk->sk_prot->get_port(sk, inet->num)) {
484                 inet->sport = htons(inet->num);
485
486                 sk_dst_reset(sk);
487                 sk->sk_prot->hash(sk);
488
489                 return 0;
490         }
491
492         sk->sk_state = TCP_CLOSE;
493         __reqsk_queue_destroy(&tp->accept_queue);
494         return -EADDRINUSE;
495 }
496
497 /*
498  *      This routine closes sockets which have been at least partially
499  *      opened, but not yet accepted.
500  */
501
502 static void tcp_listen_stop (struct sock *sk)
503 {
504         struct tcp_sock *tp = tcp_sk(sk);
505         struct request_sock *acc_req;
506         struct request_sock *req;
507
508         tcp_delete_keepalive_timer(sk);
509
510         /* make all the listen_opt local to us */
511         acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
512
513         /* Following specs, it would be better either to send FIN
514          * (and enter FIN-WAIT-1, it is normal close)
515          * or to send active reset (abort).
516          * Certainly, it is pretty dangerous while synflood, but it is
517          * bad justification for our negligence 8)
518          * To be honest, we are not able to make either
519          * of the variants now.                 --ANK
520          */
521         reqsk_queue_destroy(&tp->accept_queue);
522
523         while ((req = acc_req) != NULL) {
524                 struct sock *child = req->sk;
525
526                 acc_req = req->dl_next;
527
528                 local_bh_disable();
529                 bh_lock_sock(child);
530                 BUG_TRAP(!sock_owned_by_user(child));
531                 sock_hold(child);
532
533                 tcp_disconnect(child, O_NONBLOCK);
534
535                 sock_orphan(child);
536
537                 atomic_inc(&tcp_orphan_count);
538
539                 tcp_destroy_sock(child);
540
541                 bh_unlock_sock(child);
542                 local_bh_enable();
543                 sock_put(child);
544
545                 sk_acceptq_removed(sk);
546                 __reqsk_free(req);
547         }
548         BUG_TRAP(!sk->sk_ack_backlog);
549 }
550
551 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
552 {
553         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
554         tp->pushed_seq = tp->write_seq;
555 }
556
557 static inline int forced_push(struct tcp_sock *tp)
558 {
559         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
560 }
561
562 static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
563                               struct sk_buff *skb)
564 {
565         skb->csum = 0;
566         TCP_SKB_CB(skb)->seq = tp->write_seq;
567         TCP_SKB_CB(skb)->end_seq = tp->write_seq;
568         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
569         TCP_SKB_CB(skb)->sacked = 0;
570         skb_header_release(skb);
571         __skb_queue_tail(&sk->sk_write_queue, skb);
572         sk_charge_skb(sk, skb);
573         if (!sk->sk_send_head)
574                 sk->sk_send_head = skb;
575         if (tp->nonagle & TCP_NAGLE_PUSH)
576                 tp->nonagle &= ~TCP_NAGLE_PUSH; 
577 }
578
579 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
580                                 struct sk_buff *skb)
581 {
582         if (flags & MSG_OOB) {
583                 tp->urg_mode = 1;
584                 tp->snd_up = tp->write_seq;
585                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
586         }
587 }
588
589 static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
590                             int mss_now, int nonagle)
591 {
592         if (sk->sk_send_head) {
593                 struct sk_buff *skb = sk->sk_write_queue.prev;
594                 if (!(flags & MSG_MORE) || forced_push(tp))
595                         tcp_mark_push(tp, skb);
596                 tcp_mark_urg(tp, flags, skb);
597                 __tcp_push_pending_frames(sk, tp, mss_now,
598                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
599         }
600 }
601
/*
 * Queue page-backed data on the socket's write queue without copying it.
 *
 * sk:      socket to send on; tcp_sendpage() calls this with the socket
 *          locked (lock_sock()/release_sock() around the call).
 * pages:   array of pages holding the data; indexed by poffset / PAGE_SIZE.
 * poffset: byte offset of the first byte to send, counted from the start
 *          of pages[0].
 * psize:   number of bytes to send.
 * flags:   MSG_* flags; MSG_DONTWAIT, MSG_OOB and MSG_MORE are looked at
 *          here (MSG_MORE is masked off when pushing before a sleep).
 *
 * Each loop iteration handles at most the rest of one page.  The page is
 * attached to the tail skb of the write queue as a fragment (or merged
 * into the last fragment when skb_can_coalesce() allows it); a fresh skb
 * is started when there is nothing pending, the tail skb reached
 * size_goal, or its fragment slots are exhausted.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise the
 * result of sk_stream_error() for the error that stopped the transfer.
 */
602 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
603                          size_t psize, int flags)
604 {
605         struct tcp_sock *tp = tcp_sk(sk);
606         int mss_now, size_goal;
607         int err;
608         ssize_t copied;
609         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
610
611         /* Wait for a connection to finish. */
612         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
613                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
614                         goto out_err;
615
616         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
617
618         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
619         size_goal = tp->xmit_size_goal;
620         copied = 0;
621
            /* A pending socket error or a send-side shutdown fails with
             * -EPIPE before anything is queued. */
622         err = -EPIPE;
623         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
624                 goto do_error;
625
626         while (psize > 0) {
627                 struct sk_buff *skb = sk->sk_write_queue.prev;
628                 struct page *page = pages[poffset / PAGE_SIZE];
629                 int copy, i, can_coalesce;
630                 int offset = poffset % PAGE_SIZE;
631                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
632
                    /* Reuse the tail skb only if something is still pending
                     * to be sent and the tail has room below size_goal;
                     * "copy" is then the room that is left. */
633                 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
634 new_segment:
635                         if (!sk_stream_memory_free(sk))
636                                 goto wait_for_sndbuf;
637
638                         skb = sk_stream_alloc_pskb(sk, 0, 0,
639                                                    sk->sk_allocation);
640                         if (!skb)
641                                 goto wait_for_memory;
642
643                         skb_entail(sk, tp, skb);
644                         copy = size_goal;
645                 }
646
647                 if (copy > size)
648                         copy = size;
649
                    /* If the page cannot be merged into the last fragment
                     * and every fragment slot is taken, mark this skb for
                     * push and continue in a new one. */
650                 i = skb_shinfo(skb)->nr_frags;
651                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
652                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
653                         tcp_mark_push(tp, skb);
654                         goto new_segment;
655                 }
                    /* "copy" bytes are charged to sk_forward_alloc below; if
                     * it is short and sk_stream_mem_schedule() fails, wait. */
656                 if (sk->sk_forward_alloc < copy &&
657                     !sk_stream_mem_schedule(sk, copy, 0))
658                         goto wait_for_memory;
659                 
                    /* Either grow the last fragment, or take a page
                     * reference and describe the data in a new fragment. */
660                 if (can_coalesce) {
661                         skb_shinfo(skb)->frags[i - 1].size += copy;
662                 } else {
663                         get_page(page);
664                         skb_fill_page_desc(skb, i, page, offset, copy);
665                 }
666
                    /* Account the new bytes on the skb, the socket and the
                     * sequence space. */
667                 skb->len += copy;
668                 skb->data_len += copy;
669                 skb->truesize += copy;
670                 sk->sk_wmem_queued += copy;
671                 sk->sk_forward_alloc -= copy;
672                 skb->ip_summed = CHECKSUM_HW;
673                 tp->write_seq += copy;
674                 TCP_SKB_CB(skb)->end_seq += copy;
675                 skb_shinfo(skb)->tso_segs = 0;
676
                    /* PSH is cleared on the skb that takes the first bytes
                     * of this call. */
677                 if (!copied)
678                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
679
680                 copied += copy;
681                 poffset += copy;
682                 if (!(psize -= copy))
683                         goto out;
684
                    /* Keep filling without transmitting while the skb is
                     * shorter than one MSS, or when sending MSG_OOB. */
685                 if (skb->len < mss_now || (flags & MSG_OOB))
686                         continue;
687
688                 if (forced_push(tp)) {
689                         tcp_mark_push(tp, skb);
690                         __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
691                 } else if (skb == sk->sk_send_head)
692                         tcp_push_one(sk, mss_now);
693                 continue;
694
695 wait_for_sndbuf:
696                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
697 wait_for_memory:
                    /* Push out what has been queued so far before sleeping. */
698                 if (copied)
699                         tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
700
701                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
702                         goto do_error;
703
                    /* Re-read both values after the wait. */
704                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
705                 size_goal = tp->xmit_size_goal;
706         }
707
708 out:
709         if (copied)
710                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
711         return copied;
712
713 do_error:
            /* A partial transfer is reported as success; the error is only
             * returned when nothing was queued. */
714         if (copied)
715                 goto out;
716 out_err:
717         return sk_stream_error(sk, flags, err);
718 }
719
720 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
721                      size_t size, int flags)
722 {
723         ssize_t res;
724         struct sock *sk = sock->sk;
725
726 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
727
728         if (!(sk->sk_route_caps & NETIF_F_SG) ||
729             !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
730                 return sock_no_sendpage(sock, page, offset, size, flags);
731
732 #undef TCP_ZC_CSUM_FLAGS
733
734         lock_sock(sk);
735         TCP_CHECK_TIMER(sk);
736         res = do_tcp_sendpages(sk, &page, offset, size, flags);
737         TCP_CHECK_TIMER(sk);
738         release_sock(sk);
739         return res;
740 }
741
/*
 * Accessors for the socket's sendmsg page cache: the page currently being
 * filled with user data and the offset of the first free byte in it.
 * Both expand to lvalues, so they can be assigned to as well as read.
 * The argument is parenthesised so that any pointer expression (not just
 * a plain identifier) can be passed safely.
 */
#define TCP_PAGE(sk)    ((sk)->sk_sndmsg_page)
#define TCP_OFF(sk)     ((sk)->sk_sndmsg_off)
744
745 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
746 {
747         int tmp = tp->mss_cache;
748
749         if (sk->sk_route_caps & NETIF_F_SG) {
750                 if (sk->sk_route_caps & NETIF_F_TSO)
751                         tmp = 0;
752                 else {
753                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
754
755                         if (tmp >= pgbreak &&
756                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
757                                 tmp = pgbreak;
758                 }
759         }
760
761         return tmp;
762 }
763
/*
 * Copy user data described by msg->msg_iov into the socket's write queue.
 *
 * iocb: not referenced in this function.
 * sk:   socket to send on; locked here with lock_sock() and released on
 *       every return path.
 * msg:  supplies the flags (msg_flags) and the iovec array
 *       (msg_iov / msg_iovlen) to send.
 * size: not referenced in this function; the amount sent is derived from
 *       the iov_len of each iovec entry.
 *
 * Data is appended to the tail skb of the write queue: first into the
 * skb's tailroom, and once that is used up into page fragments backed by
 * the per-socket page cache (TCP_PAGE/TCP_OFF).  A new skb is started when
 * nothing is pending, the tail reached size_goal, or no further fragment
 * can be added.
 *
 * Returns the number of bytes copied if that is non-zero, otherwise the
 * result of sk_stream_error() for the error that stopped the transfer.
 */
764 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
765                 size_t size)
766 {
767         struct iovec *iov;
768         struct tcp_sock *tp = tcp_sk(sk);
769         struct sk_buff *skb;
770         int iovlen, flags;
771         int mss_now, size_goal;
772         int err, copied;
773         long timeo;
774
775         lock_sock(sk);
776         TCP_CHECK_TIMER(sk);
777
778         flags = msg->msg_flags;
779         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
780
781         /* Wait for a connection to finish. */
782         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
783                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
784                         goto out_err;
785
786         /* This should be in poll */
787         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
788
789         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
790         size_goal = tp->xmit_size_goal;
791
792         /* Ok commence sending. */
793         iovlen = msg->msg_iovlen;
794         iov = msg->msg_iov;
795         copied = 0;
796
            /* A pending socket error or a send-side shutdown fails with
             * -EPIPE before anything is copied. */
797         err = -EPIPE;
798         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
799                 goto do_error;
800
            /* Outer loop: one iovec entry per iteration.  Inner loop: copy
             * that entry piecewise until seglen reaches zero. */
801         while (--iovlen >= 0) {
802                 int seglen = iov->iov_len;
803                 unsigned char __user *from = iov->iov_base;
804
805                 iov++;
806
807                 while (seglen > 0) {
808                         int copy;
809
810                         skb = sk->sk_write_queue.prev;
811
                            /* Reuse the tail skb only if something is still
                             * pending to be sent and the tail has room below
                             * size_goal; "copy" is then the room left. */
812                         if (!sk->sk_send_head ||
813                             (copy = size_goal - skb->len) <= 0) {
814
815 new_segment:
816                                 /* Allocate new segment. If the interface is SG,
817                                  * allocate skb fitting to single page.
818                                  */
819                                 if (!sk_stream_memory_free(sk))
820                                         goto wait_for_sndbuf;
821
822                                 skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
823                                                            0, sk->sk_allocation);
824                                 if (!skb)
825                                         goto wait_for_memory;
826
827                                 /*
828                                  * Check whether we can use HW checksum.
829                                  */
830                                 if (sk->sk_route_caps &
831                                     (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
832                                      NETIF_F_HW_CSUM))
833                                         skb->ip_summed = CHECKSUM_HW;
834
835                                 skb_entail(sk, tp, skb);
836                                 copy = size_goal;
837                         }
838
839                         /* Try to append data to the end of skb. */
840                         if (copy > seglen)
841                                 copy = seglen;
842
843                         /* Where to copy to? */
844                         if (skb_tailroom(skb) > 0) {
845                                 /* We have some space in skb head. Superb! */
846                                 if (copy > skb_tailroom(skb))
847                                         copy = skb_tailroom(skb);
848                                 if ((err = skb_add_data(skb, from, copy)) != 0)
849                                         goto do_fault;
850                         } else {
851                                 int merge = 0;
852                                 int i = skb_shinfo(skb)->nr_frags;
853                                 struct page *page = TCP_PAGE(sk);
854                                 int off = TCP_OFF(sk);
855
                                    /* Three cases: extend the last fragment
                                     * (cached page not yet full), give up on
                                     * this skb, or drop a cached page that
                                     * is completely used. */
856                                 if (skb_can_coalesce(skb, i, page, off) &&
857                                     off != PAGE_SIZE) {
858                                         /* We can extend the last page
859                                          * fragment. */
860                                         merge = 1;
861                                 } else if (i == MAX_SKB_FRAGS ||
862                                            (!i &&
863                                            !(sk->sk_route_caps & NETIF_F_SG))) {
864                                         /* Need to add new fragment and cannot
865                                          * do this because interface is non-SG,
866                                          * or because all the page slots are
867                                          * busy. */
868                                         tcp_mark_push(tp, skb);
869                                         goto new_segment;
870                                 } else if (page) {
871                                         if (off == PAGE_SIZE) {
872                                                 put_page(page);
873                                                 TCP_PAGE(sk) = page = NULL;
874                                         }
875                                 }
876
877                                 if (!page) {
878                                         /* Allocate new cache page. */
879                                         if (!(page = sk_stream_alloc_page(sk)))
880                                                 goto wait_for_memory;
881                                         off = 0;
882                                 }
883
884                                 if (copy > PAGE_SIZE - off)
885                                         copy = PAGE_SIZE - off;
886
887                                 /* Time to copy data. We are close to
888                                  * the end! */
889                                 err = skb_copy_to_page(sk, from, skb, page,
890                                                        off, copy);
891                                 if (err) {
892                                         /* If this page was new, give it to the
893                                          * socket so it does not get leaked.
894                                          */
895                                         if (!TCP_PAGE(sk)) {
896                                                 TCP_PAGE(sk) = page;
897                                                 TCP_OFF(sk) = 0;
898                                         }
899                                         goto do_error;
900                                 }
901
902                                 /* Update the skb. */
903                                 if (merge) {
904                                         skb_shinfo(skb)->frags[i - 1].size +=
905                                                                         copy;
906                                 } else {
907                                         skb_fill_page_desc(skb, i, page, off, copy);
                                            /* Page references: a page that
                                             * is already cached gets one more
                                             * for the new fragment; a freshly
                                             * allocated page that still has
                                             * room gets one more and becomes
                                             * the cached page.  A fresh page
                                             * filled completely is neither
                                             * cached nor re-referenced here
                                             * (presumably the fragment keeps
                                             * the allocation's reference --
                                             * confirm in
                                             * sk_stream_alloc_page()). */
908                                         if (TCP_PAGE(sk)) {
909                                                 get_page(page);
910                                         } else if (off + copy < PAGE_SIZE) {
911                                                 get_page(page);
912                                                 TCP_PAGE(sk) = page;
913                                         }
914                                 }
915
916                                 TCP_OFF(sk) = off + copy;
917                         }
918
                            /* PSH is cleared on the skb that takes the first
                             * bytes of this call. */
919                         if (!copied)
920                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
921
922                         tp->write_seq += copy;
923                         TCP_SKB_CB(skb)->end_seq += copy;
924                         skb_shinfo(skb)->tso_segs = 0;
925
926                         from += copy;
927                         copied += copy;
                            /* Finished once the last iovec entry is used up. */
928                         if ((seglen -= copy) == 0 && iovlen == 0)
929                                 goto out;
930
                            /* Keep filling without transmitting while the skb
                             * is shorter than one MSS, or for MSG_OOB. */
931                         if (skb->len < mss_now || (flags & MSG_OOB))
932                                 continue;
933
934                         if (forced_push(tp)) {
935                                 tcp_mark_push(tp, skb);
936                                 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
937                         } else if (skb == sk->sk_send_head)
938                                 tcp_push_one(sk, mss_now);
939                         continue;
940
941 wait_for_sndbuf:
942                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
943 wait_for_memory:
                            /* Push out what has been queued so far before
                             * sleeping. */
944                         if (copied)
945                                 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
946
947                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
948                                 goto do_error;
949
                            /* Re-read both values after the wait. */
950                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
951                         size_goal = tp->xmit_size_goal;
952                 }
953         }
954
955 out:
956         if (copied)
957                 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
958         TCP_CHECK_TIMER(sk);
959         release_sock(sk);
960         return copied;
961
962 do_fault:
            /* skb_add_data() failed.  If the skb holds no data at all, take
             * it off the write queue (and out of sk_send_head) and free it. */
963         if (!skb->len) {
964                 if (sk->sk_send_head == skb)
965                         sk->sk_send_head = NULL;
966                 __skb_unlink(skb, &sk->sk_write_queue);
967                 sk_stream_free_skb(sk, skb);
968         }
969
970 do_error:
            /* A partial transfer is reported as success; the error is only
             * returned when nothing was copied. */
971         if (copied)
972                 goto out;
973 out_err:
974         err = sk_stream_error(sk, flags, err);
975         TCP_CHECK_TIMER(sk);
976         release_sock(sk);
977         return err;
978 }
979
980 /*
981  *      Handle reading urgent data. BSD has very simple semantics for
982  *      this, no blocking and very strange errors 8)
983  */
984
985 static int tcp_recv_urg(struct sock *sk, long timeo,
986                         struct msghdr *msg, int len, int flags,
987                         int *addr_len)
988 {
989         struct tcp_sock *tp = tcp_sk(sk);
990
991         /* No URG data to read. */
992         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
993             tp->urg_data == TCP_URG_READ)
994                 return -EINVAL; /* Yes this is right ! */
995
996         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
997                 return -ENOTCONN;
998
999         if (tp->urg_data & TCP_URG_VALID) {
1000                 int err = 0;
1001                 char c = tp->urg_data;
1002
1003                 if (!(flags & MSG_PEEK))
1004                         tp->urg_data = TCP_URG_READ;
1005
1006                 /* Read urgent data. */
1007                 msg->msg_flags |= MSG_OOB;
1008
1009                 if (len > 0) {
1010                         if (!(flags & MSG_TRUNC))
1011                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1012                         len = 1;
1013                 } else
1014                         msg->msg_flags |= MSG_TRUNC;
1015
1016                 return err ? -EFAULT : len;
1017         }
1018
1019         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1020                 return 0;
1021
1022         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1023          * the available implementations agree in this case:
1024          * this call should never block, independent of the
1025          * blocking state of the socket.
1026          * Mike <pall@rz.uni-karlsruhe.de>
1027          */
1028         return -EAGAIN;
1029 }
1030
1031 /* Clean up the receive buffer for full frames taken by the user,
1032  * then send an ACK if necessary.  COPIED is the number of bytes
1033  * tcp_recvmsg has given to the user so far, it speeds up the
1034  * calculation of whether or not we must ACK for the sake of
1035  * a window update.
1036  */
1037 static void cleanup_rbuf(struct sock *sk, int copied)
1038 {
1039         struct tcp_sock *tp = tcp_sk(sk);
1040         int time_to_ack = 0;
1041
1042 #if TCP_DEBUG
1043         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1044
1045         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1046 #endif
1047
1048         if (tcp_ack_scheduled(tp)) {
1049                    /* Delayed ACKs frequently hit locked sockets during bulk
1050                     * receive. */
1051                 if (tp->ack.blocked ||
1052                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1053                     tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
1054                     /*
1055                      * If this read emptied read buffer, we send ACK, if
1056                      * connection is not bidirectional, user drained
1057                      * receive buffer and there was a small segment
1058                      * in queue.
1059                      */
1060                     (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1061                      !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1062                         time_to_ack = 1;
1063         }
1064
1065         /* We send an ACK if we can now advertise a non-zero window
1066          * which has been raised "significantly".
1067          *
1068          * Even if window raised up to infinity, do not send window open ACK
1069          * in states, where we will not receive more. It is useless.
1070          */
1071         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1072                 __u32 rcv_window_now = tcp_receive_window(tp);
1073
1074                 /* Optimize, __tcp_select_window() is not cheap. */
1075                 if (2*rcv_window_now <= tp->window_clamp) {
1076                         __u32 new_window = __tcp_select_window(sk);
1077
1078                         /* Send ACK now, if this read freed lots of space
1079                          * in our buffer. Certainly, new_window is new window.
1080                          * We can advertise it now, if it is not less than current one.
1081                          * "Lots" means "at least twice" here.
1082                          */
1083                         if (new_window && new_window >= 2 * rcv_window_now)
1084                                 time_to_ack = 1;
1085                 }
1086         }
1087         if (time_to_ack)
1088                 tcp_send_ack(sk);
1089 }
1090
1091 static void tcp_prequeue_process(struct sock *sk)
1092 {
1093         struct sk_buff *skb;
1094         struct tcp_sock *tp = tcp_sk(sk);
1095
1096         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1097
1098         /* RX process wants to run with disabled BHs, though it is not
1099          * necessary */
1100         local_bh_disable();
1101         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1102                 sk->sk_backlog_rcv(sk, skb);
1103         local_bh_enable();
1104
1105         /* Clear memory counter. */
1106         tp->ucopy.memory = 0;
1107 }
1108
1109 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1110 {
1111         struct sk_buff *skb;
1112         u32 offset;
1113
1114         skb_queue_walk(&sk->sk_receive_queue, skb) {
1115                 offset = seq - TCP_SKB_CB(skb)->seq;
1116                 if (skb->h.th->syn)
1117                         offset--;
1118                 if (offset < skb->len || skb->h.th->fin) {
1119                         *off = offset;
1120                         return skb;
1121                 }
1122         }
1123         return NULL;
1124 }
1125
1126 /*
1127  * This routine provides an alternative to tcp_recvmsg() for routines
1128  * that would like to handle copying from skbuffs directly in 'sendfile'
1129  * fashion.
1130  * Note:
1131  *      - It is assumed that the socket was locked by the caller.
1132  *      - The routine does not block.
1133  *      - At present, there is no support for reading OOB data
1134  *        or for 'peeking' the socket using this routine
1135  *        (although both would be easy to implement).
1136  */
/*
 * sk:         socket to read from.
 * desc:       passed through to recv_actor; reading stops once
 *             desc->count is zero after an skb has been consumed.
 * recv_actor: called as recv_actor(desc, skb, offset, len) for each run
 *             of readable bytes; its return value is taken as the number
 *             of bytes it consumed.
 *
 * Returns -ENOTCONN for a listening socket, otherwise the number of bytes
 * consumed by recv_actor (possibly 0).  tp->copied_seq is advanced past
 * everything consumed, including a FIN.
 */
1137 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1138                   sk_read_actor_t recv_actor)
1139 {
1140         struct sk_buff *skb;
1141         struct tcp_sock *tp = tcp_sk(sk);
1142         u32 seq = tp->copied_seq;
1143         u32 offset;
1144         int copied = 0;
1145
1146         if (sk->sk_state == TCP_LISTEN)
1147                 return -ENOTCONN;
1148         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1149                 if (offset < skb->len) {
1150                         size_t used, len;
1151
1152                         len = skb->len - offset;
1153                         /* Stop reading if we hit a patch of urgent data */
1154                         if (tp->urg_data) {
1155                                 u32 urg_offset = tp->urg_seq - seq;
1156                                 if (urg_offset < len)
1157                                         len = urg_offset;
1158                                 if (!len)
1159                                         break;
1160                         }
                             /* NOTE(review): "used" is a size_t.  If
                              * recv_actor can return a negative value (its
                              * return type is not visible here -- confirm),
                              * it becomes a huge unsigned number, fails the
                              * "used <= len" test below and ends the loop
                              * without the error being reported. */
1161                         used = recv_actor(desc, skb, offset, len);
1162                         if (used <= len) {
1163                                 seq += used;
1164                                 copied += used;
1165                                 offset += used;
1166                         }
                             /* The actor did not take the whole skb: leave
                              * it queued and stop. */
1167                         if (offset != skb->len)
1168                                 break;
1169                 }
                     /* A FIN takes one sequence number; consume the skb
                      * and stop. */
1170                 if (skb->h.th->fin) {
1171                         sk_eat_skb(sk, skb);
1172                         ++seq;
1173                         break;
1174                 }
1175                 sk_eat_skb(sk, skb);
1176                 if (!desc->count)
1177                         break;
1178         }
1179         tp->copied_seq = seq;
1180
1181         tcp_rcv_space_adjust(sk);
1182
1183         /* Clean up data we have read: This will do ACK frames. */
1184         if (copied)
1185                 cleanup_rbuf(sk, copied);
1186         return copied;
1187 }
1188
1189 /*
1190  *      This routine copies from a sock struct into the user buffer.
1191  *
1192  *      Technical note: in 2.3 we work on _locked_ socket, so that
1193  *      tricks with *seq access order and skb->users are not required.
1194  *      Probably, code can be easily improved even more.
1195  */
1196
1197 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1198                 size_t len, int nonblock, int flags, int *addr_len)
1199 {
1200         struct tcp_sock *tp = tcp_sk(sk);
1201         int copied = 0;
1202         u32 peek_seq;
1203         u32 *seq;
1204         unsigned long used;
1205         int err;
1206         int target;             /* Read at least this many bytes */
1207         long timeo;
1208         struct task_struct *user_recv = NULL;
1209
1210         lock_sock(sk);
1211
1212         TCP_CHECK_TIMER(sk);
1213
1214         err = -ENOTCONN;
1215         if (sk->sk_state == TCP_LISTEN)
1216                 goto out;
1217
1218         timeo = sock_rcvtimeo(sk, nonblock);
1219
1220         /* Urgent data needs to be handled specially. */
1221         if (flags & MSG_OOB)
1222                 goto recv_urg;
1223
1224         seq = &tp->copied_seq;
1225         if (flags & MSG_PEEK) {
1226                 peek_seq = tp->copied_seq;
1227                 seq = &peek_seq;
1228         }
1229
1230         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1231
1232         do {
1233                 struct sk_buff *skb;
1234                 u32 offset;
1235
1236                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1237                 if (tp->urg_data && tp->urg_seq == *seq) {
1238                         if (copied)
1239                                 break;
1240                         if (signal_pending(current)) {
1241                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1242                                 break;
1243                         }
1244                 }
1245
1246                 /* Next get a buffer. */
1247
1248                 skb = skb_peek(&sk->sk_receive_queue);
1249                 do {
1250                         if (!skb)
1251                                 break;
1252
1253                         /* Now that we have two receive queues this
1254                          * shouldn't happen.
1255                          */
1256                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1257                                 printk(KERN_INFO "recvmsg bug: copied %X "
1258                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1259                                 break;
1260                         }
1261                         offset = *seq - TCP_SKB_CB(skb)->seq;
1262                         if (skb->h.th->syn)
1263                                 offset--;
1264                         if (offset < skb->len)
1265                                 goto found_ok_skb;
1266                         if (skb->h.th->fin)
1267                                 goto found_fin_ok;
1268                         BUG_TRAP(flags & MSG_PEEK);
1269                         skb = skb->next;
1270                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1271
1272                 /* Well, if we have backlog, try to process it now yet. */
1273
1274                 if (copied >= target && !sk->sk_backlog.tail)
1275                         break;
1276
1277                 if (copied) {
1278                         if (sk->sk_err ||
1279                             sk->sk_state == TCP_CLOSE ||
1280                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1281                             !timeo ||
1282                             signal_pending(current) ||
1283                             (flags & MSG_PEEK))
1284                                 break;
1285                 } else {
1286                         if (sock_flag(sk, SOCK_DONE))
1287                                 break;
1288
1289                         if (sk->sk_err) {
1290                                 copied = sock_error(sk);
1291                                 break;
1292                         }
1293
1294                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1295                                 break;
1296
1297                         if (sk->sk_state == TCP_CLOSE) {
1298                                 if (!sock_flag(sk, SOCK_DONE)) {
1299                                         /* This occurs when user tries to read
1300                                          * from never connected socket.
1301                                          */
1302                                         copied = -ENOTCONN;
1303                                         break;
1304                                 }
1305                                 break;
1306                         }
1307
1308                         if (!timeo) {
1309                                 copied = -EAGAIN;
1310                                 break;
1311                         }
1312
1313                         if (signal_pending(current)) {
1314                                 copied = sock_intr_errno(timeo);
1315                                 break;
1316                         }
1317                 }
1318
1319                 cleanup_rbuf(sk, copied);
1320
1321                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1322                         /* Install new reader */
1323                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1324                                 user_recv = current;
1325                                 tp->ucopy.task = user_recv;
1326                                 tp->ucopy.iov = msg->msg_iov;
1327                         }
1328
1329                         tp->ucopy.len = len;
1330
1331                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1332                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1333
1334                         /* Ugly... If prequeue is not empty, we have to
1335                          * process it before releasing socket, otherwise
1336                          * order will be broken at second iteration.
1337                          * More elegant solution is required!!!
1338                          *
1339                          * Look: we have the following (pseudo)queues:
1340                          *
1341                          * 1. packets in flight
1342                          * 2. backlog
1343                          * 3. prequeue
1344                          * 4. receive_queue
1345                          *
1346                          * Each queue can be processed only if the next ones
1347                          * are empty. At this point we have empty receive_queue.
1348                          * But prequeue _can_ be not empty after 2nd iteration,
1349                          * when we jumped to start of loop because backlog
1350                          * processing added something to receive_queue.
1351                          * We cannot release_sock(), because backlog contains
1352                          * packets arrived _after_ prequeued ones.
1353                          *
1354                          * Shortly, algorithm is clear --- to process all
1355                          * the queues in order. We could make it more directly,
1356                          * requeueing packets from backlog to prequeue, if
1357                          * is not empty. It is more elegant, but eats cycles,
1358                          * unfortunately.
1359                          */
1360                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1361                                 goto do_prequeue;
1362
1363                         /* __ Set realtime policy in scheduler __ */
1364                 }
1365
1366                 if (copied >= target) {
1367                         /* Do not sleep, just process backlog. */
1368                         release_sock(sk);
1369                         lock_sock(sk);
1370                 } else
1371                         sk_wait_data(sk, &timeo);
1372
1373                 if (user_recv) {
1374                         int chunk;
1375
1376                         /* __ Restore normal policy in scheduler __ */
1377
1378                         if ((chunk = len - tp->ucopy.len) != 0) {
1379                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1380                                 len -= chunk;
1381                                 copied += chunk;
1382                         }
1383
1384                         if (tp->rcv_nxt == tp->copied_seq &&
1385                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1386 do_prequeue:
1387                                 tcp_prequeue_process(sk);
1388
1389                                 if ((chunk = len - tp->ucopy.len) != 0) {
1390                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1391                                         len -= chunk;
1392                                         copied += chunk;
1393                                 }
1394                         }
1395                 }
1396                 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1397                         if (net_ratelimit())
1398                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1399                                        current->comm, current->pid);
1400                         peek_seq = tp->copied_seq;
1401                 }
1402                 continue;
1403
1404         found_ok_skb:
1405                 /* Ok so how much can we use? */
1406                 used = skb->len - offset;
1407                 if (len < used)
1408                         used = len;
1409
1410                 /* Do we have urgent data here? */
1411                 if (tp->urg_data) {
1412                         u32 urg_offset = tp->urg_seq - *seq;
1413                         if (urg_offset < used) {
1414                                 if (!urg_offset) {
1415                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1416                                                 ++*seq;
1417                                                 offset++;
1418                                                 used--;
1419                                                 if (!used)
1420                                                         goto skip_copy;
1421                                         }
1422                                 } else
1423                                         used = urg_offset;
1424                         }
1425                 }
1426
1427                 if (!(flags & MSG_TRUNC)) {
1428                         err = skb_copy_datagram_iovec(skb, offset,
1429                                                       msg->msg_iov, used);
1430                         if (err) {
1431                                 /* Exception. Bailout! */
1432                                 if (!copied)
1433                                         copied = -EFAULT;
1434                                 break;
1435                         }
1436                 }
1437
1438                 *seq += used;
1439                 copied += used;
1440                 len -= used;
1441
1442                 tcp_rcv_space_adjust(sk);
1443
1444 skip_copy:
1445                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1446                         tp->urg_data = 0;
1447                         tcp_fast_path_check(sk, tp);
1448                 }
1449                 if (used + offset < skb->len)
1450                         continue;
1451
1452                 if (skb->h.th->fin)
1453                         goto found_fin_ok;
1454                 if (!(flags & MSG_PEEK))
1455                         sk_eat_skb(sk, skb);
1456                 continue;
1457
1458         found_fin_ok:
1459                 /* Process the FIN. */
1460                 ++*seq;
1461                 if (!(flags & MSG_PEEK))
1462                         sk_eat_skb(sk, skb);
1463                 break;
1464         } while (len > 0);
1465
1466         if (user_recv) {
1467                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1468                         int chunk;
1469
1470                         tp->ucopy.len = copied > 0 ? len : 0;
1471
1472                         tcp_prequeue_process(sk);
1473
1474                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1475                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1476                                 len -= chunk;
1477                                 copied += chunk;
1478                         }
1479                 }
1480
1481                 tp->ucopy.task = NULL;
1482                 tp->ucopy.len = 0;
1483         }
1484
1485         /* According to UNIX98, msg_name/msg_namelen are ignored
1486          * on connected socket. I was just happy when found this 8) --ANK
1487          */
1488
1489         /* Clean up data we have read: This will do ACK frames. */
1490         cleanup_rbuf(sk, copied);
1491
1492         TCP_CHECK_TIMER(sk);
1493         release_sock(sk);
1494         return copied;
1495
1496 out:
1497         TCP_CHECK_TIMER(sk);
1498         release_sock(sk);
1499         return err;
1500
1501 recv_urg:
1502         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1503         goto out;
1504 }
1505
1506 /*
1507  *      State processing on a close. This implements the state shift for
1508  *      sending our FIN frame. Note that we only send a FIN for some
1509  *      states. A shutdown() may have already sent the FIN, or we may be
1510  *      closed.
1511  */
1512
1513 static unsigned char new_state[16] = {
1514   /* current state:        new state:      action:      */
1515   /* (Invalid)          */ TCP_CLOSE,
1516   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1517   /* TCP_SYN_SENT       */ TCP_CLOSE,
1518   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1519   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1520   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1521   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1522   /* TCP_CLOSE          */ TCP_CLOSE,
1523   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1524   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1525   /* TCP_LISTEN         */ TCP_CLOSE,
1526   /* TCP_CLOSING        */ TCP_CLOSING,
1527 };
1528
1529 static int tcp_close_state(struct sock *sk)
1530 {
1531         int next = (int)new_state[sk->sk_state];
1532         int ns = next & TCP_STATE_MASK;
1533
1534         tcp_set_state(sk, ns);
1535
1536         return next & TCP_ACTION_FIN;
1537 }
1538
1539 /*
1540  *      Shutdown the sending side of a connection. Much like close except
1541  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1542  */
1543
1544 void tcp_shutdown(struct sock *sk, int how)
1545 {
1546         /*      We need to grab some memory, and put together a FIN,
1547          *      and then put it into the queue to be sent.
1548          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1549          */
1550         if (!(how & SEND_SHUTDOWN))
1551                 return;
1552
1553         /* If we've already sent a FIN, or it's a closed state, skip this. */
1554         if ((1 << sk->sk_state) &
1555             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1556              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1557                 /* Clear out any half completed packets.  FIN if needed. */
1558                 if (tcp_close_state(sk))
1559                         tcp_send_fin(sk);
1560         }
1561 }
1562
1563 /*
1564  * At this point, there should be no process reference to this
1565  * socket, and thus no user references at all.  Therefore we
1566  * can assume the socket waitqueue is inactive and nobody will
1567  * try to jump onto it.
      *
      * Final teardown of a TCP socket.  The caller must hand in a socket
      * that is in TCP_CLOSE, flagged SOCK_DEAD and already unhashed; each
      * of these is checked with BUG_TRAP().  The function then runs the
      * protocol destroy hook, frees the queues and the xfrm policy,
      * decrements tcp_orphan_count and drops one reference with sock_put().
1568  */
1569 void tcp_destroy_sock(struct sock *sk)
1570 {
1571         BUG_TRAP(sk->sk_state == TCP_CLOSE);
1572         BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1573
1574         /* It cannot be in hash table! */
1575         BUG_TRAP(sk_unhashed(sk));
1576
1577         /* If inet_sk(sk)->num is non-zero, the socket must still be bound */
1578         BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1579
             /* Protocol specific cleanup hook. */
1580         sk->sk_prot->destroy(sk);
1581
1582         sk_stream_kill_queues(sk);
1583
1584         xfrm_sk_free_policy(sk);
1585
1586         sk_refcnt_debug_release(sk);
1587
             /* NOTE(review): presumably pairs with the atomic_inc() of
              * tcp_orphan_count done by the caller (see tcp_close()) - confirm
              * for the other callers.
              */
1588         atomic_dec(&tcp_orphan_count);
1589         sock_put(sk);
1590 }
1591
/*
 * tcp_close - close a TCP socket on behalf of its user.
 * @sk:      socket being closed
 * @timeout: handed unchanged to sk_stream_wait_close()
 *
 * With the socket lock held: sets sk_shutdown to SHUTDOWN_MASK, then
 *  - a listening socket is moved to TCP_CLOSE and tcp_listen_stop() is run;
 *  - otherwise the receive queue is flushed and, depending on the outcome,
 *    the connection is reset (unread data was thrown away), disconnected
 *    (SOCK_LINGER set with a zero sk_lingertime) or a FIN is sent when
 *    tcp_close_state() asks for one.
 * After the socket lock is released the socket is orphaned and, under the
 * BH lock, either destroyed at once (state TCP_CLOSE) or left alive until
 * the protocol finishes the close.
 */
1592 void tcp_close(struct sock *sk, long timeout)
1593 {
1594         struct sk_buff *skb;
1595         int data_was_unread = 0;
1596
1597         lock_sock(sk);
1598         sk->sk_shutdown = SHUTDOWN_MASK;
1599
1600         if (sk->sk_state == TCP_LISTEN) {
1601                 tcp_set_state(sk, TCP_CLOSE);
1602
1603                 /* Special case. */
1604                 tcp_listen_stop(sk);
1605
1606                 goto adjudge_to_death;
1607         }
1608
1609         /*  We need to flush the recv. buffs.  We do this only on the
1610          *  descriptor close, not protocol-sourced closes, because the
1611          *  reader process may not have drained the data yet!
1612          */
1613         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                     /* Sequence space covered by the skb, minus one if it
                      * carries a FIN, i.e. the number of unread data bytes.
                      */
1614                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1615                           skb->h.th->fin;
1616                 data_was_unread += len;
1617                 __kfree_skb(skb);
1618         }
1619
1620         sk_stream_mem_reclaim(sk);
1621
1622         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1623          * 3.10, we send a RST here because data was lost.  To
1624          * witness the awful effects of the old behavior of always
1625          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1626          * a bulk GET in an FTP client, suspend the process, wait
1627          * for the client to advertise a zero window, then kill -9
1628          * the FTP client, wheee...  Note: timeout is always zero
1629          * in such a case.
1630          */
1631         if (data_was_unread) {
1632                 /* Unread data was tossed, zap the connection. */
1633                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1634                 tcp_set_state(sk, TCP_CLOSE);
1635                 tcp_send_active_reset(sk, GFP_KERNEL);
1636         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1637                 /* Check zero linger _after_ checking for unread data. */
1638                 sk->sk_prot->disconnect(sk, 0);
1639                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1640         } else if (tcp_close_state(sk)) {
1641                 /* We FIN if the application ate all the data before
1642                  * zapping the connection.
1643                  */
1644
1645                 /* RED-PEN. Formally speaking, we have broken TCP state
1646                  * machine. State transitions:
1647                  *
1648                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1649                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1650                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1651                  *
1652                  * are legal only when FIN has been sent (i.e. in window),
1653                  * rather than queued out of window. Purists blame.
1654                  *
1655                  * F.e. "RFC state" is ESTABLISHED,
1656                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1657                  *
1658                  * The visible deviations are that sometimes
1659                  * we enter time-wait state, when it is not required really
1660                  * (harmless), do not send active resets, when they are
1661                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1662                  * they look as CLOSING or LAST_ACK for Linux)
1663                  * Probably, I missed some more holelets.
1664                  *                                              --ANK
1665                  */
1666                 tcp_send_fin(sk);
1667         }
1668
1669         sk_stream_wait_close(sk, timeout);
1670
1671 adjudge_to_death:
1672         /* It is the last release_sock in its life. It will remove backlog. */
1673         release_sock(sk);
1674
1675
1676         /* Now socket is owned by kernel and we acquire BH lock
1677            to finish close. No need to check for user refs.
1678          */
1679         local_bh_disable();
1680         bh_lock_sock(sk);
1681         BUG_TRAP(!sock_owned_by_user(sk));
1682
             /* Extra reference for the rest of this function; it is dropped
              * by the sock_put() after the "out" label.
              */
1683         sock_hold(sk);
1684         sock_orphan(sk);
1685
1686         /*      This is a (useful) BSD violation of the RFC. There is a
1687          *      problem with TCP as specified in that the other end could
1688          *      keep a socket open forever with no application left this end.
1689          *      We use a 3 minute timeout (about the same as BSD) then kill
1690          *      our end. If they send after that then tough - BUT: long enough
1691          *      that we won't make the old 4*rto = almost no time - whoops
1692          *      reset mistake.
1693          *
1694          *      Nope, it was not mistake. It is really desired behaviour
1695          *      f.e. on http servers, when such sockets are useless, but
1696          *      consume significant resources. Let's do it with special
1697          *      linger2 option.                                 --ANK
1698          */
1699
1700         if (sk->sk_state == TCP_FIN_WAIT2) {
1701                 struct tcp_sock *tp = tcp_sk(sk);
1702                 if (tp->linger2 < 0) {
                             /* Negative linger2: do not wait in FIN_WAIT2,
                              * reset the connection right away.
                              */
1703                         tcp_set_state(sk, TCP_CLOSE);
1704                         tcp_send_active_reset(sk, GFP_ATOMIC);
1705                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1706                 } else {
1707                         int tmo = tcp_fin_time(tp);
1708
1709                         if (tmo > TCP_TIMEWAIT_LEN) {
                                     /* The timer is armed with a second
                                      * tcp_fin_time(tp) call rather than tmo.
                                      */
1710                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1711                         } else {
                                     /* This path jumps to "out" and so skips
                                      * the common atomic_inc() below; the
                                      * count is bumped here instead.
                                      */
1712                                 atomic_inc(&tcp_orphan_count);
1713                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1714                                 goto out;
1715                         }
1716                 }
1717         }
1718         if (sk->sk_state != TCP_CLOSE) {
1719                 sk_stream_mem_reclaim(sk);
                     /* Abort instead of lingering when there are more orphans
                      * than sysctl_tcp_max_orphans, or when this socket still
                      * has more than SOCK_MIN_SNDBUF queued for sending while
                      * tcp_memory_allocated is above sysctl_tcp_mem[2].
                      */
1720                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1721                     (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
1722                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1723                         if (net_ratelimit())
1724                                 printk(KERN_INFO "TCP: too many of orphaned "
1725                                        "sockets\n");
1726                         tcp_set_state(sk, TCP_CLOSE);
1727                         tcp_send_active_reset(sk, GFP_ATOMIC);
1728                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1729                 }
1730         }
             /* Counted as an orphan; tcp_destroy_sock() decrements again. */
1731         atomic_inc(&tcp_orphan_count);
1732
1733         if (sk->sk_state == TCP_CLOSE)
1734                 tcp_destroy_sock(sk);
1735         /* Otherwise, socket is reprieved until protocol close. */
1736
1737 out:
1738         bh_unlock_sock(sk);
1739         local_bh_enable();
1740         sock_put(sk);
1741 }
1742
1743 /* These states need RST on ABORT according to RFC793 */
1744
1745 static inline int tcp_need_reset(int state)
1746 {
1747         return (1 << state) &
1748                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1749                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1750 }
1751
/*
 * tcp_disconnect - abort the current connection and reset the socket.
 * @sk:    socket to disconnect
 * @flags: not referenced by this function
 *
 * Moves the socket to TCP_CLOSE, stops listening or sends a reset as the
 * previous state requires, stops the transmit timers, purges the receive,
 * write and out-of-order queues and puts the per-connection fields of the
 * tcp_sock back to their starting values.  Finally calls
 * sk->sk_error_report().
 *
 * Returns 0; err is never assigned after its initialisation.
 */
1752 int tcp_disconnect(struct sock *sk, int flags)
1753 {
1754         struct inet_sock *inet = inet_sk(sk);
1755         struct tcp_sock *tp = tcp_sk(sk);
1756         int err = 0;
1757         int old_state = sk->sk_state;
1758
1759         if (old_state != TCP_CLOSE)
1760                 tcp_set_state(sk, TCP_CLOSE);
1761
1762         /* ABORT function of RFC793 */
1763         if (old_state == TCP_LISTEN) {
1764                 tcp_listen_stop(sk);
1765         } else if (tcp_need_reset(old_state) ||
1766                    (tp->snd_nxt != tp->write_seq &&
1767                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1768                 /* The last check adjusts for the discrepancy of Linux
1769                  * wrt. RFC states
1770                  */
1771                 tcp_send_active_reset(sk, gfp_any());
1772                 sk->sk_err = ECONNRESET;
1773         } else if (old_state == TCP_SYN_SENT)
1774                 sk->sk_err = ECONNRESET;
1775
1776         tcp_clear_xmit_timers(sk);
1777         __skb_queue_purge(&sk->sk_receive_queue);
1778         sk_stream_writequeue_purge(sk);
1779         __skb_queue_purge(&tp->out_of_order_queue);
1780
1781         inet->dport = 0;
1782
             /* Keep the source address only if the user locked it. */
1783         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1784                 inet_reset_saddr(sk);
1785
1786         sk->sk_shutdown = 0;
1787         sock_reset_flag(sk, SOCK_DONE);
1788         tp->srtt = 0;
             /* Advance write_seq past the old window; 0 is skipped and
              * replaced by 1.
              */
1789         if ((tp->write_seq += tp->max_window + 2) == 0)
1790                 tp->write_seq = 1;
1791         tp->backoff = 0;
1792         tp->snd_cwnd = 2;
1793         tp->probes_out = 0;
1794         tp->packets_out = 0;
1795         tp->snd_ssthresh = 0x7fffffff;
1796         tp->snd_cwnd_cnt = 0;
1797         tcp_set_ca_state(tp, TCP_CA_Open);
1798         tcp_clear_retrans(tp);
1799         tcp_delack_init(tp);
1800         sk->sk_send_head = NULL;
1801         tp->rx_opt.saw_tstamp = 0;
1802         tcp_sack_reset(&tp->rx_opt);
1803         __sk_dst_reset(sk);
1804
             /* A socket with a local port number must still be bound. */
1805         BUG_TRAP(!inet->num || tp->bind_hash);
1806
1807         sk->sk_error_report(sk);
1808         return err;
1809 }
1810
1811 /*
1812  *      Wait for an incoming connection, avoid race
1813  *      conditions. This must be called with the socket locked.
      *
      *      @sk:    listening socket
      *      @timeo: maximum time to sleep, as passed to schedule_timeout()
      *
      *      The socket lock is dropped while sleeping and re-taken before
      *      the conditions are checked, so it is held again on return.
      *
      *      Returns 0 once the accept queue is non-empty, -EINVAL if the
      *      socket is no longer in TCP_LISTEN, sock_intr_errno(timeo) if a
      *      signal is pending, and -EAGAIN when the timeout has run out.
1814  */
1815 static int wait_for_connect(struct sock *sk, long timeo)
1816 {
1817         struct tcp_sock *tp = tcp_sk(sk);
1818         DEFINE_WAIT(wait);
1819         int err;
1820
1821         /*
1822          * True wake-one mechanism for incoming connections: only
1823          * one process gets woken up, not the 'whole herd'.
1824          * Since we do not 'race & poll' for established sockets
1825          * anymore, the common case will execute the loop only once.
1826          *
1827          * Subtle issue: "add_wait_queue_exclusive()" will be added
1828          * after any current non-exclusive waiters, and we know that
1829          * it will always _stay_ after any new non-exclusive waiters
1830          * because all non-exclusive waiters are added at the
1831          * beginning of the wait-queue. As such, it's ok to "drop"
1832          * our exclusiveness temporarily when we get woken up without
1833          * having to remove and re-insert us on the wait queue.
1834          */
1835         for (;;) {
1836                 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
1837                                           TASK_INTERRUPTIBLE);
1838                 release_sock(sk);
                     /* Sleep only if nothing is queued; schedule_timeout()
                      * hands back the time that is still left.
                      */
1839                 if (reqsk_queue_empty(&tp->accept_queue))
1840                         timeo = schedule_timeout(timeo);
1841                 lock_sock(sk);
                     /* err is set before each test so that the break taken
                      * determines the value returned.
                      */
1842                 err = 0;
1843                 if (!reqsk_queue_empty(&tp->accept_queue))
1844                         break;
1845                 err = -EINVAL;
1846                 if (sk->sk_state != TCP_LISTEN)
1847                         break;
1848                 err = sock_intr_errno(timeo);
1849                 if (signal_pending(current))
1850                         break;
1851                 err = -EAGAIN;
1852                 if (!timeo)
1853                         break;
1854         }
1855         finish_wait(sk->sk_sleep, &wait);
1856         return err;
1857 }
1858
1859 /*
1860  *      This will accept the next outstanding connection.
1861  */
1862
1863 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1864 {
1865         struct tcp_sock *tp = tcp_sk(sk);
1866         struct sock *newsk;
1867         int error;
1868
1869         lock_sock(sk);
1870
1871         /* We need to make sure that this socket is listening,
1872          * and that it has something pending.
1873          */
1874         error = -EINVAL;
1875         if (sk->sk_state != TCP_LISTEN)
1876                 goto out_err;
1877
1878         /* Find already established connection */
1879         if (reqsk_queue_empty(&tp->accept_queue)) {
1880                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1881
1882                 /* If this is a non blocking socket don't sleep */
1883                 error = -EAGAIN;
1884                 if (!timeo)
1885                         goto out_err;
1886
1887                 error = wait_for_connect(sk, timeo);
1888                 if (error)
1889                         goto out_err;
1890         }
1891
1892         newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
1893         BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
1894 out:
1895         release_sock(sk);
1896         return newsk;
1897 out_err:
1898         newsk = NULL;
1899         *err = error;
1900         goto out;
1901 }
1902
1903 /*
1904  *      Socket option code for TCP.
      *
      *      @sk:      socket the option is set on
      *      @level:   option level; anything but SOL_TCP is forwarded to
      *                tp->af_specific->setsockopt()
      *      @optname: TCP_* option to set
      *      @optval:  user space pointer to the option value
      *      @optlen:  length of the value in bytes
      *
      *      TCP_CONGESTION takes a string; every other option is read as
      *      an int from @optval and applied under the socket lock.
      *
      *      Returns 0 on success, -EINVAL for a too short or out of range
      *      value, -EFAULT if the value cannot be read from user space,
      *      -ENOPROTOOPT for an unknown option, or the result of the
      *      forwarded setsockopt() / tcp_set_congestion_control() call.
1905  */
1906 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1907                    int optlen)
1908 {
1909         struct tcp_sock *tp = tcp_sk(sk);
1910         int val;
1911         int err = 0;
1912
1913         if (level != SOL_TCP)
1914                 return tp->af_specific->setsockopt(sk, level, optname,
1915                                                    optval, optlen);
1916
1917         /* This is a string value; all the others are ints */
1918         if (optname == TCP_CONGESTION) {
1919                 char name[TCP_CA_NAME_MAX];
1920
1921                 if (optlen < 1)
1922                         return -EINVAL;
1923
                     /* At most TCP_CA_NAME_MAX-1 bytes are copied, which
                      * leaves room for the terminator written below.
                      */
1924                 val = strncpy_from_user(name, optval,
1925                                         min(TCP_CA_NAME_MAX-1, optlen));
1926                 if (val < 0)
1927                         return -EFAULT;
1928                 name[val] = 0;
1929
1930                 lock_sock(sk);
1931                 err = tcp_set_congestion_control(tp, name);
1932                 release_sock(sk);
1933                 return err;
1934         }
1935
             /* NOTE(review): optlen is an int compared against sizeof(),
              * which is unsigned, so a negative optlen presumably converts
              * to a large value and is not rejected here - confirm.
              */
1936         if (optlen < sizeof(int))
1937                 return -EINVAL;
1938
1939         if (get_user(val, (int __user *)optval))
1940                 return -EFAULT;
1941
1942         lock_sock(sk);
1943
1944         switch (optname) {
1945         case TCP_MAXSEG:
1946                 /* Values greater than interface MTU won't take effect. However
1947                  * at the point when this call is done we typically don't yet
1948                  * know which interface is going to be used */
1949                 if (val < 8 || val > MAX_TCP_WINDOW) {
1950                         err = -EINVAL;
1951                         break;
1952                 }
1953                 tp->rx_opt.user_mss = val;
1954                 break;
1955
1956         case TCP_NODELAY:
1957                 if (val) {
1958                         /* TCP_NODELAY is weaker than TCP_CORK, so that
1959                          * this option on corked socket is remembered, but
1960                          * it is not activated until cork is cleared.
1961                          *
1962                          * However, when TCP_NODELAY is set we make
1963                          * an explicit push, which overrides even TCP_CORK
1964                          * for currently queued segments.
1965                          */
1966                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
1967                         tcp_push_pending_frames(sk, tp);
1968                 } else {
1969                         tp->nonagle &= ~TCP_NAGLE_OFF;
1970                 }
1971                 break;
1972
1973         case TCP_CORK:
1974                 /* When set indicates to always queue non-full frames.
1975                  * Later the user clears this option and we transmit
1976                  * any pending partial frames in the queue.  This is
1977                  * meant to be used alongside sendfile() to get properly
1978                  * filled frames when the user (for example) must write
1979                  * out headers with a write() call first and then use
1980                  * sendfile to send out the data parts.
1981                  *
1982                  * TCP_CORK can be set together with TCP_NODELAY and it is
1983                  * stronger than TCP_NODELAY.
1984                  */
1985                 if (val) {
1986                         tp->nonagle |= TCP_NAGLE_CORK;
1987                 } else {
1988                         tp->nonagle &= ~TCP_NAGLE_CORK;
1989                         if (tp->nonagle&TCP_NAGLE_OFF)
1990                                 tp->nonagle |= TCP_NAGLE_PUSH;
1991                         tcp_push_pending_frames(sk, tp);
1992                 }
1993                 break;
1994
1995         case TCP_KEEPIDLE:
1996                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1997                         err = -EINVAL;
1998                 else {
                             /* val is in seconds, stored in jiffies. */
1999                         tp->keepalive_time = val * HZ;
                             /* If keepalive is on and the socket is neither
                              * closed nor listening, re-arm the timer with
                              * what is left of the new idle time counted
                              * from rcv_tstamp (0 if already exceeded).
                              */
2000                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2001                             !((1 << sk->sk_state) &
2002                               (TCPF_CLOSE | TCPF_LISTEN))) {
2003                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2004                                 if (tp->keepalive_time > elapsed)
2005                                         elapsed = tp->keepalive_time - elapsed;
2006                                 else
2007                                         elapsed = 0;
2008                                 tcp_reset_keepalive_timer(sk, elapsed);
2009                         }
2010                 }
2011                 break;
2012         case TCP_KEEPINTVL:
2013                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2014                         err = -EINVAL;
2015                 else
2016                         tp->keepalive_intvl = val * HZ;
2017                 break;
2018         case TCP_KEEPCNT:
2019                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2020                         err = -EINVAL;
2021                 else
2022                         tp->keepalive_probes = val;
2023                 break;
2024         case TCP_SYNCNT:
2025                 if (val < 1 || val > MAX_TCP_SYNCNT)
2026                         err = -EINVAL;
2027                 else
2028                         tp->syn_retries = val;
2029                 break;
2030
2031         case TCP_LINGER2:
                     /* Negative: stored as -1.  Larger than
                      * sysctl_tcp_fin_timeout (in seconds): stored as 0.
                      * Otherwise the value is stored in jiffies.
                      */
2032                 if (val < 0)
2033                         tp->linger2 = -1;
2034                 else if (val > sysctl_tcp_fin_timeout / HZ)
2035                         tp->linger2 = 0;
2036                 else
2037                         tp->linger2 = val * HZ;
2038                 break;
2039
2040         case TCP_DEFER_ACCEPT:
2041                 tp->defer_accept = 0;
2042                 if (val > 0) {
2043                         /* Translate value in seconds to number of
2044                          * retransmits */
                             /* The stored value is that count plus one; 0 is
                              * kept when val <= 0.
                              * NOTE(review): the shift count can approach 31
                              * for very large val - check for overflow of
                              * the shifted expression.
                              */
2045                         while (tp->defer_accept < 32 &&
2046                                val > ((TCP_TIMEOUT_INIT / HZ) <<
2047                                        tp->defer_accept))
2048                                 tp->defer_accept++;
2049                         tp->defer_accept++;
2050                 }
2051                 break;
2052
2053         case TCP_WINDOW_CLAMP:
2054                 if (!val) {
                             /* A zero clamp is accepted only in TCP_CLOSE. */
2055                         if (sk->sk_state != TCP_CLOSE) {
2056                                 err = -EINVAL;
2057                                 break;
2058                         }
2059                         tp->window_clamp = 0;
2060                 } else
                             /* Non-zero values are raised to at least
                              * SOCK_MIN_RCVBUF / 2.
                              */
2061                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2062                                                 SOCK_MIN_RCVBUF / 2 : val;
2063                 break;
2064
2065         case TCP_QUICKACK:
2066                 if (!val) {
2067                         tp->ack.pingpong = 1;
2068                 } else {
2069                         tp->ack.pingpong = 0;
                             /* With an ACK scheduled on an ESTABLISHED or
                              * CLOSE_WAIT socket, mark it pushed and let
                              * cleanup_rbuf() handle it now.  An even val
                              * then switches pingpong back on.
                              */
2070                         if ((1 << sk->sk_state) &
2071                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2072                             tcp_ack_scheduled(tp)) {
2073                                 tp->ack.pending |= TCP_ACK_PUSHED;
2074                                 cleanup_rbuf(sk, 1);
2075                                 if (!(val & 1))
2076                                         tp->ack.pingpong = 1;
2077                         }
2078                 }
2079                 break;
2080
2081         default:
2082                 err = -ENOPROTOOPT;
2083                 break;
2084         };
2085         release_sock(sk);
2086         return err;
2087 }
2088
2089 /* Return information about state of tcp endpoint in API format. */
2090 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2091 {
2092         struct tcp_sock *tp = tcp_sk(sk);
2093         u32 now = tcp_time_stamp;
2094
2095         memset(info, 0, sizeof(*info));
2096
2097         info->tcpi_state = sk->sk_state;
2098         info->tcpi_ca_state = tp->ca_state;
2099         info->tcpi_retransmits = tp->retransmits;
2100         info->tcpi_probes = tp->probes_out;
2101         info->tcpi_backoff = tp->backoff;
2102
2103         if (tp->rx_opt.tstamp_ok)
2104                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2105         if (tp->rx_opt.sack_ok)
2106                 info->tcpi_options |= TCPI_OPT_SACK;
2107         if (tp->rx_opt.wscale_ok) {
2108                 info->tcpi_options |= TCPI_OPT_WSCALE;
2109                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2110                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2111         } 
2112
2113         if (tp->ecn_flags&TCP_ECN_OK)
2114                 info->tcpi_options |= TCPI_OPT_ECN;
2115
2116         info->tcpi_rto = jiffies_to_usecs(tp->rto);
2117         info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2118         info->tcpi_snd_mss = tp->mss_cache;
2119         info->tcpi_rcv_mss = tp->ack.rcv_mss;
2120
2121         info->tcpi_unacked = tp->packets_out;
2122         info->tcpi_sacked = tp->sacked_out;
2123         info->tcpi_lost = tp->lost_out;
2124         info->tcpi_retrans = tp->retrans_out;
2125         info->tcpi_fackets = tp->fackets_out;
2126
2127         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2128         info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
2129         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2130
2131         info->tcpi_pmtu = tp->pmtu_cookie;
2132         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2133         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2134         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2135         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2136         info->tcpi_snd_cwnd = tp->snd_cwnd;
2137         info->tcpi_advmss = tp->advmss;
2138         info->tcpi_reordering = tp->reordering;
2139
2140         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2141         info->tcpi_rcv_space = tp->rcvq_space.space;
2142
2143         info->tcpi_total_retrans = tp->total_retrans;
2144 }
2145
2146 EXPORT_SYMBOL_GPL(tcp_get_info);
2147
2148 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2149                    int __user *optlen)
2150 {
2151         struct tcp_sock *tp = tcp_sk(sk);
2152         int val, len;
2153
2154         if (level != SOL_TCP)
2155                 return tp->af_specific->getsockopt(sk, level, optname,
2156                                                    optval, optlen);
2157
2158         if (get_user(len, optlen))
2159                 return -EFAULT;
2160
2161         len = min_t(unsigned int, len, sizeof(int));
2162
2163         if (len < 0)
2164                 return -EINVAL;
2165
2166         switch (optname) {
2167         case TCP_MAXSEG:
2168                 val = tp->mss_cache;
2169                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2170                         val = tp->rx_opt.user_mss;
2171                 break;
2172         case TCP_NODELAY:
2173                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2174                 break;
2175         case TCP_CORK:
2176                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2177                 break;
2178         case TCP_KEEPIDLE:
2179                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2180                 break;
2181         case TCP_KEEPINTVL:
2182                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2183                 break;
2184         case TCP_KEEPCNT:
2185                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2186                 break;
2187         case TCP_SYNCNT:
2188                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2189                 break;
2190         case TCP_LINGER2:
2191                 val = tp->linger2;
2192                 if (val >= 0)
2193                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2194                 break;
2195         case TCP_DEFER_ACCEPT:
2196                 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2197                                                (tp->defer_accept - 1));
2198                 break;
2199         case TCP_WINDOW_CLAMP:
2200                 val = tp->window_clamp;
2201                 break;
2202         case TCP_INFO: {
2203                 struct tcp_info info;
2204
2205                 if (get_user(len, optlen))
2206                         return -EFAULT;
2207
2208                 tcp_get_info(sk, &info);
2209
2210                 len = min_t(unsigned int, len, sizeof(info));
2211                 if (put_user(len, optlen))
2212                         return -EFAULT;
2213                 if (copy_to_user(optval, &info, len))
2214                         return -EFAULT;
2215                 return 0;
2216         }
2217         case TCP_QUICKACK:
2218                 val = !tp->ack.pingpong;
2219                 break;
2220
2221         case TCP_CONGESTION:
2222                 if (get_user(len, optlen))
2223                         return -EFAULT;
2224                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2225                 if (put_user(len, optlen))
2226                         return -EFAULT;
2227                 if (copy_to_user(optval, tp->ca_ops->name, len))
2228                         return -EFAULT;
2229                 return 0;
2230         default:
2231                 return -ENOPROTOOPT;
2232         };
2233
2234         if (put_user(len, optlen))
2235                 return -EFAULT;
2236         if (copy_to_user(optval, &val, len))
2237                 return -EFAULT;
2238         return 0;
2239 }
2240
2241
/*
 * Called from tcp_init() only when struct tcp_skb_cb is larger than
 * skb->cb.  NOTE(review): presumably left undefined on purpose so that an
 * oversized tcp_skb_cb shows up as a link failure - confirm.
 */
2242 extern void __skb_cb_too_small_for_tcp(int, int);
/* Congestion control ops that tcp_init() registers. */
2243 extern struct tcp_congestion_ops tcp_reno;
2244
2245 static __initdata unsigned long thash_entries;
2246 static int __init set_thash_entries(char *str)
2247 {
2248         if (!str)
2249                 return 0;
2250         thash_entries = simple_strtoul(str, &str, 0);
2251         return 1;
2252 }
2253 __setup("thash_entries=", set_thash_entries);
2254
2255 void __init tcp_init(void)
2256 {
2257         struct sk_buff *skb = NULL;
2258         int order, i;
2259
2260         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2261                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2262                                            sizeof(skb->cb));
2263
2264         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2265                                               sizeof(struct inet_bind_bucket),
2266                                               0, SLAB_HWCACHE_ALIGN,
2267                                               NULL, NULL);
2268         if (!tcp_bucket_cachep)
2269                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2270
2271         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2272                                                 sizeof(struct tcp_tw_bucket),
2273                                                 0, SLAB_HWCACHE_ALIGN,
2274                                                 NULL, NULL);
2275         if (!tcp_timewait_cachep)
2276                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2277
2278         /* Size and allocate the main established and bind bucket
2279          * hash tables.
2280          *
2281          * The methodology is similar to that of the buffer cache.
2282          */
2283         tcp_ehash =
2284                 alloc_large_system_hash("TCP established",
2285                                         sizeof(struct inet_ehash_bucket),
2286                                         thash_entries,
2287                                         (num_physpages >= 128 * 1024) ?
2288                                                 (25 - PAGE_SHIFT) :
2289                                                 (27 - PAGE_SHIFT),
2290                                         HASH_HIGHMEM,
2291                                         &tcp_ehash_size,
2292                                         NULL,
2293                                         0);
2294         tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
2295         for (i = 0; i < (tcp_ehash_size << 1); i++) {
2296                 rwlock_init(&tcp_ehash[i].lock);
2297                 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2298         }
2299
2300         tcp_bhash =
2301                 alloc_large_system_hash("TCP bind",
2302                                         sizeof(struct inet_bind_hashbucket),
2303                                         tcp_ehash_size,
2304                                         (num_physpages >= 128 * 1024) ?
2305                                                 (25 - PAGE_SHIFT) :
2306                                                 (27 - PAGE_SHIFT),
2307                                         HASH_HIGHMEM,
2308                                         &tcp_bhash_size,
2309                                         NULL,
2310                                         64 * 1024);
2311         tcp_bhash_size = 1 << tcp_bhash_size;
2312         for (i = 0; i < tcp_bhash_size; i++) {
2313                 spin_lock_init(&tcp_bhash[i].lock);
2314                 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2315         }
2316
2317         /* Try to be a bit smarter and adjust defaults depending
2318          * on available memory.
2319          */
2320         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2321                         (tcp_bhash_size * sizeof(struct inet_bind_hashbucket));
2322                         order++)
2323                 ;
2324         if (order >= 4) {
2325                 sysctl_local_port_range[0] = 32768;
2326                 sysctl_local_port_range[1] = 61000;
2327                 sysctl_tcp_max_tw_buckets = 180000;
2328                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2329                 sysctl_max_syn_backlog = 1024;
2330         } else if (order < 3) {
2331                 sysctl_local_port_range[0] = 1024 * (3 - order);
2332                 sysctl_tcp_max_tw_buckets >>= (3 - order);
2333                 sysctl_tcp_max_orphans >>= (3 - order);
2334                 sysctl_max_syn_backlog = 128;
2335         }
2336         tcp_port_rover = sysctl_local_port_range[0] - 1;
2337
2338         sysctl_tcp_mem[0] =  768 << order;
2339         sysctl_tcp_mem[1] = 1024 << order;
2340         sysctl_tcp_mem[2] = 1536 << order;
2341
2342         if (order < 3) {
2343                 sysctl_tcp_wmem[2] = 64 * 1024;
2344                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2345                 sysctl_tcp_rmem[1] = 43689;
2346                 sysctl_tcp_rmem[2] = 2 * 43689;
2347         }
2348
2349         printk(KERN_INFO "TCP: Hash tables configured "
2350                "(established %d bind %d)\n",
2351                tcp_ehash_size << 1, tcp_bhash_size);
2352
2353         tcp_register_congestion_control(&tcp_reno);
2354 }
2355
/* TCP entry points and data exported for use by modules. */
2356 EXPORT_SYMBOL(tcp_accept);
2357 EXPORT_SYMBOL(tcp_close);
2358 EXPORT_SYMBOL(tcp_destroy_sock);
2359 EXPORT_SYMBOL(tcp_disconnect);
2360 EXPORT_SYMBOL(tcp_getsockopt);
2361 EXPORT_SYMBOL(tcp_ioctl);
2362 EXPORT_SYMBOL(tcp_poll);
2363 EXPORT_SYMBOL(tcp_read_sock);
2364 EXPORT_SYMBOL(tcp_recvmsg);
2365 EXPORT_SYMBOL(tcp_sendmsg);
2366 EXPORT_SYMBOL(tcp_sendpage);
2367 EXPORT_SYMBOL(tcp_setsockopt);
2368 EXPORT_SYMBOL(tcp_shutdown);
2369 EXPORT_SYMBOL(tcp_statistics);
2370 EXPORT_SYMBOL(tcp_timewait_cachep);