/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Generic socket support routines. Memory allocators, socket lock/release
 *              handler for protocols to use and generic option handler.
 *
 *
 * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *              Alan Cox        :       Numerous verify_area() problems
 *              Alan Cox        :       Connecting on a connecting socket
 *                                      now returns an error for tcp.
 *              Alan Cox        :       sock->protocol is set correctly.
 *                                      and is not sometimes left as 0.
 *              Alan Cox        :       connect handles icmp errors on a
 *                                      connect properly. Unfortunately there
 *                                      is a restart syscall nasty there. I
 *                                      can't match BSD without hacking the C
 *                                      library. Ideas urgently sought!
 *              Alan Cox        :       Disallow bind() to addresses that are
 *                                      not ours - especially broadcast ones!!
 *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
 *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
 *                                      instead they leave that for the DESTROY timer.
 *              Alan Cox        :       Clean up error flag in accept
 *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
 *                                      was buggy. Put a remove_sock() in the handler
 *                                      for memory when we hit 0. Also altered the timer
 *                                      code. The ACK stuff can wait and needs major
 *                                      TCP layer surgery.
 *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
 *                                      and fixed timer/inet_bh race.
 *              Alan Cox        :       Added zapped flag for TCP
 *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
 *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
 *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
 *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
 *      Pauline Middelink       :       identd support
 *              Alan Cox        :       Fixed connect() taking signals I think.
 *              Alan Cox        :       SO_LINGER supported
 *              Alan Cox        :       Error reporting fixes
 *              Anonymous       :       inet_create tidied up (sk->reuse setting)
 *              Alan Cox        :       inet sockets don't set sk->type!
 *              Alan Cox        :       Split socket option code
 *              Alan Cox        :       Callbacks
 *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
 *              Alex            :       Removed restriction on inet fioctl
 *              Alan Cox        :       Splitting INET from NET core
 *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
 *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
 *              Alan Cox        :       Split IP from generic code
 *              Alan Cox        :       New kfree_skbmem()
 *              Alan Cox        :       Make SO_DEBUG superuser only.
 *              Alan Cox        :       Allow anyone to clear SO_DEBUG
 *                                      (compatibility fix)
 *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
 *              Alan Cox        :       Allocator for a socket is settable.
 *              Alan Cox        :       SO_ERROR includes soft errors.
 *              Alan Cox        :       Allow NULL arguments on some SO_ opts
 *              Alan Cox        :       Generic socket allocation to make hooks
 *                                      easier (suggested by Craig Metz).
 *              Michael Pall    :       SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
 *              Andi Kleen      :       Fix write_space callback
 *              Chris Evans     :       Security fixes - signedness again
 *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
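
/*
 * Worked example (illustrative, not from this file): on a platform
 * where sizeof(struct sk_buff) is 256 bytes, _SK_MEM_OVERHEAD is
 * 256 + 256 = 512 bytes, so SK_WMEM_MAX and SK_RMEM_MAX both come to
 * 512 * 256 = 131072 bytes.  A platform with a smaller sk_buff gets
 * proportionally smaller defaults, but the same 256-packet budget.
 */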

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
        struct timeval tv;

        if (optlen < sizeof(tv))
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;

        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
        if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
                *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
        return 0;
}
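
/*
 * Worked example (assuming HZ == 100, i.e. 1000000/HZ == 10000 us per
 * jiffy): a request of tv = { .tv_sec = 2, .tv_usec = 500000 } yields
 *
 *      *timeo_p = 2 * 100 + (500000 + 9999) / 10000 = 200 + 50 = 250
 *
 * jiffies.  The (1000000/HZ - 1) term rounds the microsecond part up,
 * so the effective timeout is never shorter than the one requested.
 */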

static void sock_warn_obsolete_bsdism(const char *name)
{
        static int warned;
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm, current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
        }
}

static void sock_disable_timestamp(struct sock *sk)
{
        if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = 0;
        int skb_len;

        /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
           the number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf) {
                err = -ENOMEM;
                goto out;
        }

        /* This would deadlock if sock_queue_rcv_skb were used with
           the socket lock held! We assume that callers of this
           function are lock free.
        */
        err = sk_filter(sk, skb, 1);
        if (err)
                goto out;

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* Cache the SKB length before we tack it onto the receive
         * queue.  Once it is added it no longer belongs to us and
         * may be freed by other threads of control pulling packets
         * from the queue.
         */
        skb_len = skb->len;

        skb_queue_tail(&sk->sk_receive_queue, skb);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk, skb_len);
out:
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
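
/*
 * Minimal usage sketch (hypothetical protocol, not part of this file):
 * a lock-free receive handler hands the skb to the owning socket and
 * frees it itself on failure, e.g. when the receive buffer is full
 * (-ENOMEM) or the socket filter dropped the packet:
 *
 *      static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *      {
 *              int err = sock_queue_rcv_skb(sk, skb);
 *
 *              if (err < 0)
 *                      kfree_skb(skb);
 *              return err;
 *      }
 */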

int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter(sk, skb, 0))
                goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk))
                rc = sk->sk_backlog_rcv(sk, skb);
        else
                sk_add_backlog(sk, skb);
        bh_unlock_sock(sk);
out:
        sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk->sk_dst_cache;

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk->sk_dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                sk_dst_reset(sk);
                dst_release(dst);
                return NULL;
        }

        return dst;
}
EXPORT_SYMBOL(sk_dst_check);
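
/*
 * Minimal usage sketch (hypothetical caller): an output path
 * revalidates its cached route before each use and falls back to a
 * fresh lookup when the cache has gone stale:
 *
 *      struct dst_entry *dst = sk_dst_check(sk, 0);
 *
 *      if (dst == NULL) {
 *              (cached route was obsolete: perform a new route
 *               lookup here and install it with sk_dst_set())
 *      }
 */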

/*
 *      This is meant for all protocols to use and covers goings on
 *      at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        struct sock *sk=sock->sk;
        struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      Options without arguments
         */

#ifdef SO_DONTLINGER            /* Compatibility item... */
        if (optname == SO_DONTLINGER) {
                lock_sock(sk);
                sock_reset_flag(sk, SOCK_LINGER);
                release_sock(sk);
                return 0;
        }
#endif

        if(optlen<sizeof(int))
                return(-EINVAL);

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        valbool = val?1:0;

        lock_sock(sk);

        switch(optname)
        {
                case SO_DEBUG:
                        if(val && !capable(CAP_NET_ADMIN))
                        {
                                ret = -EACCES;
                        }
                        else if (valbool)
                                sock_set_flag(sk, SOCK_DBG);
                        else
                                sock_reset_flag(sk, SOCK_DBG);
                        break;
                case SO_REUSEADDR:
                        sk->sk_reuse = valbool;
                        break;
                case SO_TYPE:
                case SO_ERROR:
                        ret = -ENOPROTOOPT;
                        break;
                case SO_DONTROUTE:
                        if (valbool)
                                sock_set_flag(sk, SOCK_LOCALROUTE);
                        else
                                sock_reset_flag(sk, SOCK_LOCALROUTE);
                        break;
                case SO_BROADCAST:
                        sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                        break;
                case SO_SNDBUF:
                        /* Don't error on this: BSD doesn't, and if you
                           think about it this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints */

                        if (val > sysctl_wmem_max)
                                val = sysctl_wmem_max;
set_sndbuf:
                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                        if ((val * 2) < SOCK_MIN_SNDBUF)
                                sk->sk_sndbuf = SOCK_MIN_SNDBUF;
                        else
                                sk->sk_sndbuf = val * 2;

                        /*
                         *      Wake up sending tasks if we
                         *      upped the value.
                         */
                        sk->sk_write_space(sk);
                        break;

                case SO_SNDBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_sndbuf;

                case SO_RCVBUF:
                        /* Don't error on this: BSD doesn't, and if you
                           think about it this is right. Otherwise apps
                           have to play 'guess the biggest size' games.
                           RCVBUF/SNDBUF are treated in BSD as hints */

                        if (val > sysctl_rmem_max)
                                val = sysctl_rmem_max;
set_rcvbuf:
                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                        /*
                         * We double it on the way in to account for
                         * "struct sk_buff" etc. overhead.   Applications
                         * assume that the SO_RCVBUF setting they make will
                         * allow that much actual data to be received on that
                         * socket.
                         *
                         * Applications are unaware that "struct sk_buff" and
                         * other overheads allocate from the receive buffer
                         * during socket buffer allocation.
                         *
                         * And after considering the possible alternatives,
                         * returning the value we actually used in getsockopt
                         * is the most desirable behavior.
                         */
                        if ((val * 2) < SOCK_MIN_RCVBUF)
                                sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
                        else
                                sk->sk_rcvbuf = val * 2;
                        break;

                case SO_RCVBUFFORCE:
                        if (!capable(CAP_NET_ADMIN)) {
                                ret = -EPERM;
                                break;
                        }
                        goto set_rcvbuf;

                case SO_KEEPALIVE:
#ifdef CONFIG_INET
                        if (sk->sk_protocol == IPPROTO_TCP)
                                tcp_set_keepalive(sk, valbool);
#endif
                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                        break;

                case SO_OOBINLINE:
                        sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                        break;

                case SO_NO_CHECK:
                        sk->sk_no_check = valbool;
                        break;

                case SO_PRIORITY:
                        if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
                                sk->sk_priority = val;
                        else
                                ret = -EPERM;
                        break;

                case SO_LINGER:
                        if(optlen<sizeof(ling)) {
                                ret = -EINVAL;  /* 1003.1g */
                                break;
                        }
                        if (copy_from_user(&ling,optval,sizeof(ling))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (!ling.l_onoff)
                                sock_reset_flag(sk, SOCK_LINGER);
                        else {
#if (BITS_PER_LONG == 32)
                                if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
                                        sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
                                else
#endif
                                        sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
                                sock_set_flag(sk, SOCK_LINGER);
                        }
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("setsockopt");
                        break;

                case SO_PASSCRED:
                        if (valbool)
                                set_bit(SOCK_PASSCRED, &sock->flags);
                        else
                                clear_bit(SOCK_PASSCRED, &sock->flags);
                        break;

                case SO_TIMESTAMP:
                        if (valbool)  {
                                sock_set_flag(sk, SOCK_RCVTSTAMP);
                                sock_enable_timestamp(sk);
                        } else
                                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVLOWAT:
                        if (val < 0)
                                val = INT_MAX;
                        sk->sk_rcvlowat = val ? : 1;
                        break;

                case SO_RCVTIMEO:
                        ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
                        break;

                case SO_SNDTIMEO:
                        ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
                        break;

#ifdef CONFIG_NETDEVICES
                case SO_BINDTODEVICE:
                {
                        char devname[IFNAMSIZ];

                        /* Sorry... */
                        if (!capable(CAP_NET_RAW)) {
                                ret = -EPERM;
                                break;
                        }

                        /* Bind this socket to a particular device like "eth0",
                         * as specified in the passed interface name. If the
                         * name is "" or the option length is zero the socket
                         * is not bound.
                         */

                        if (!valbool) {
                                sk->sk_bound_dev_if = 0;
                        } else {
                                if (optlen > IFNAMSIZ - 1)
                                        optlen = IFNAMSIZ - 1;
                                memset(devname, 0, sizeof(devname));
                                if (copy_from_user(devname, optval, optlen)) {
                                        ret = -EFAULT;
                                        break;
                                }

                                /* Remove any cached route for this socket. */
                                sk_dst_reset(sk);

                                if (devname[0] == '\0') {
                                        sk->sk_bound_dev_if = 0;
                                } else {
                                        struct net_device *dev = dev_get_by_name(devname);
                                        if (!dev) {
                                                ret = -ENODEV;
                                                break;
                                        }
                                        sk->sk_bound_dev_if = dev->ifindex;
                                        dev_put(dev);
                                }
                        }
                        break;
                }
#endif


                case SO_ATTACH_FILTER:
                        ret = -EINVAL;
                        if (optlen == sizeof(struct sock_fprog)) {
                                struct sock_fprog fprog;

                                ret = -EFAULT;
                                if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                        break;

                                ret = sk_attach_filter(&fprog, sk);
                        }
                        break;

                case SO_DETACH_FILTER:
                        spin_lock_bh(&sk->sk_lock.slock);
                        filter = sk->sk_filter;
                        if (filter) {
                                sk->sk_filter = NULL;
                                spin_unlock_bh(&sk->sk_lock.slock);
                                sk_filter_release(sk, filter);
                                break;
                        }
                        spin_unlock_bh(&sk->sk_lock.slock);
                        ret = -ENONET;
                        break;

                case SO_PASSSEC:
                        if (valbool)
                                set_bit(SOCK_PASSSEC, &sock->flags);
                        else
                                clear_bit(SOCK_PASSSEC, &sock->flags);
                        break;

                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
                default:
                        ret = -ENOPROTOOPT;
                        break;
        }
        release_sock(sk);
        return ret;
}
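
/*
 * User-space view (illustrative): because SO_SNDBUF/SO_RCVBUF values
 * are doubled on the way in to cover sk_buff overhead, the value read
 * back is twice the value set, subject to the sysctl_[rw]mem_max
 * clamp and the SOCK_MIN_* floors:
 *
 *      int val = 65536, got;
 *      socklen_t len = sizeof(got);
 *
 *      setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *      (got is now 131072, assuming 65536 <= sysctl_rmem_max)
 */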


int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        union
        {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        unsigned int lv = sizeof(int);
        int len;

        if(get_user(len,optlen))
                return -EFAULT;
        if(len < 0)
                return -EINVAL;

        switch(optname)
        {
                case SO_DEBUG:
                        v.val = sock_flag(sk, SOCK_DBG);
                        break;

                case SO_DONTROUTE:
                        v.val = sock_flag(sk, SOCK_LOCALROUTE);
                        break;

                case SO_BROADCAST:
                        v.val = !!sock_flag(sk, SOCK_BROADCAST);
                        break;

                case SO_SNDBUF:
                        v.val = sk->sk_sndbuf;
                        break;

                case SO_RCVBUF:
                        v.val = sk->sk_rcvbuf;
                        break;

                case SO_REUSEADDR:
                        v.val = sk->sk_reuse;
                        break;

                case SO_KEEPALIVE:
                        v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
                        break;

                case SO_TYPE:
                        v.val = sk->sk_type;
                        break;

                case SO_ERROR:
                        v.val = -sock_error(sk);
                        if(v.val==0)
                                v.val = xchg(&sk->sk_err_soft, 0);
                        break;

                case SO_OOBINLINE:
                        v.val = !!sock_flag(sk, SOCK_URGINLINE);
                        break;

                case SO_NO_CHECK:
                        v.val = sk->sk_no_check;
                        break;

                case SO_PRIORITY:
                        v.val = sk->sk_priority;
                        break;

                case SO_LINGER:
                        lv              = sizeof(v.ling);
                        v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
                        v.ling.l_linger = sk->sk_lingertime / HZ;
                        break;

                case SO_BSDCOMPAT:
                        sock_warn_obsolete_bsdism("getsockopt");
                        break;

                case SO_TIMESTAMP:
                        v.val = sock_flag(sk, SOCK_RCVTSTAMP);
                        break;

                case SO_RCVTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_SNDTIMEO:
                        lv=sizeof(struct timeval);
                        if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                                v.tm.tv_sec = 0;
                                v.tm.tv_usec = 0;
                        } else {
                                v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                                v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                        }
                        break;

                case SO_RCVLOWAT:
                        v.val = sk->sk_rcvlowat;
                        break;

                case SO_SNDLOWAT:
                        v.val=1;
                        break;

                case SO_PASSCRED:
                        v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERCRED:
                        if (len > sizeof(sk->sk_peercred))
                                len = sizeof(sk->sk_peercred);
                        if (copy_to_user(optval, &sk->sk_peercred, len))
                                return -EFAULT;
                        goto lenout;

                case SO_PEERNAME:
                {
                        char address[128];

                        if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                                return -ENOTCONN;
                        if (lv < len)
                                return -EINVAL;
                        if (copy_to_user(optval, address, len))
                                return -EFAULT;
                        goto lenout;
                }

                /* Dubious BSD thing... Probably nobody even uses it, but
                 * the UNIX standard wants it for whatever reason... -DaveM
                 */
                case SO_ACCEPTCONN:
                        v.val = sk->sk_state == TCP_LISTEN;
                        break;

                case SO_PASSSEC:
                        v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
                        break;

                case SO_PEERSEC:
                        return security_socket_getpeersec_stream(sock, optval, optlen, len);

                default:
                        return(-ENOPROTOOPT);
        }
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
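
/*
 * User-space view (illustrative): SO_ERROR returns and clears the
 * pending socket error as a positive errno value, which is the usual
 * way to collect the result of a non-blocking connect():
 *
 *      int err = 0;
 *      socklen_t len = sizeof(err);
 *
 *      if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 &&
 *          err != 0)
 *              (the connect failed with errno "err")
 */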

/**
 *      sk_alloc - All socket objects are allocated here
 *      @family: protocol family
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *      @prot: struct proto associated with this new sock instance
 *      @zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, gfp_t priority,
                      struct proto *prot, int zero_it)
{
        struct sock *sk = NULL;
        kmem_cache_t *slab = prot->slab;

        if (slab != NULL)
                sk = kmem_cache_alloc(slab, priority);
        else
                sk = kmalloc(prot->obj_size, priority);

        if (sk) {
                if (zero_it) {
                        memset(sk, 0, prot->obj_size);
                        sk->sk_family = family;
                        /*
                         * See comment in struct sock definition to understand
                         * why we need sk_prot_creator -acme
                         */
                        sk->sk_prot = sk->sk_prot_creator = prot;
                        sock_lock_init(sk);
                }

                if (security_sk_alloc(sk, family, priority))
                        goto out_free;

                if (!try_module_get(prot->owner))
                        goto out_free;
        }
        return sk;

out_free:
        if (slab != NULL)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}
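
/*
 * Minimal usage sketch (hypothetical protocol): a family's create
 * routine typically allocates a zeroed sock against its own struct
 * proto and then fills in the generic fields.  "PF_EXAMPLE" and
 * "example_proto" are stand-ins, not real symbols:
 *
 *      struct sock *sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL,
 *                                 &example_proto, 1);
 *      if (sk == NULL)
 *              return -ENOBUFS;
 *      sock_init_data(sock, sk);
 */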

void sk_free(struct sock *sk)
{
        struct sk_filter *filter;
        struct module *owner = sk->sk_prot_creator->owner;

        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        filter = sk->sk_filter;
        if (filter) {
                sk_filter_release(sk, filter);
                sk->sk_filter = NULL;
        }

        sock_disable_timestamp(sk);

        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

        security_sk_free(sk);
        if (sk->sk_prot_creator->slab != NULL)
                kmem_cache_free(sk->sk_prot_creator->slab, sk);
        else
                kfree(sk);
        module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so
                         * invalidate the destructor and do a plain
                         * sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
        if (num_physpages <= 4096) {
                sysctl_wmem_max = 32767;
                sysctl_rmem_max = 32767;
                sysctl_wmem_default = 32767;
                sysctl_rmem_default = 32767;
        } else if (num_physpages >= 131072) {
                sysctl_wmem_max = 131071;
                sysctl_rmem_max = 131071;
        }
}

/*
 *      Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* In case it might be waiting for more memory. */
        atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
                sk->sk_write_space(sk);
        sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
        int uid;

        read_lock(&sk->sk_callback_lock);
        uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
        read_unlock(&sk->sk_callback_lock);
        return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        read_lock(&sk->sk_callback_lock);
        ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
        read_unlock(&sk->sk_callback_lock);
        return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_w(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
                if (skb) {
                        skb_set_owner_r(skb, sk);
                        return skb;
                }
        }
        return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
                 * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
                if (mem)
                        return mem;
                atomic_sub(size, &sk->sk_omem_alloc);
        }
        return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        kfree(mem);
        atomic_sub(size, &sk->sk_omem_alloc);
}
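
/*
 * Minimal usage sketch (hypothetical caller): option memory must be
 * released with sock_kfree_s() using the same size it was charged
 * with, otherwise sk_free() reports an optmem leak.  "struct
 * example_opt" is a stand-in for illustration only:
 *
 *      struct example_opt *opt = sock_kmalloc(sk, sizeof(*opt),
 *                                             GFP_KERNEL);
 *      if (opt == NULL)
 *              return -ENOBUFS;
 *      (use opt, then later:)
 *      sock_kfree_s(sk, opt, sizeof(*opt));
 */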

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
        DEFINE_WAIT(wait);

        clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
                        break;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        break;
                if (sk->sk_err)
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}


/*
 *      Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            unsigned long header_len,
                                            unsigned long data_len,
                                            int noblock, int *errcode)
{
        struct sk_buff *skb;
        gfp_t gfp_mask;
        long timeo;
        int err;

        gfp_mask = sk->sk_allocation;
        if (gfp_mask & __GFP_WAIT)
                gfp_mask |= __GFP_REPEAT;

        timeo = sock_sndtimeo(sk, noblock);
        while (1) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        goto failure;

                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                        skb = alloc_skb(header_len, sk->sk_allocation);
                        if (skb) {
                                int npages;
                                int i;

                                /* No pages, we're done... */
                                if (!data_len)
                                        break;

                                npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
                                skb->truesize += data_len;
                                skb_shinfo(skb)->nr_frags = npages;
                                for (i = 0; i < npages; i++) {
                                        struct page *page;
                                        skb_frag_t *frag;

                                        page = alloc_pages(sk->sk_allocation, 0);
                                        if (!page) {
                                                err = -ENOBUFS;
                                                skb_shinfo(skb)->nr_frags = i;
                                                kfree_skb(skb);
                                                goto failure;
                                        }

                                        frag = &skb_shinfo(skb)->frags[i];
                                        frag->page = page;
                                        frag->page_offset = 0;
                                        frag->size = (data_len >= PAGE_SIZE ?
                                                      PAGE_SIZE :
                                                      data_len);
                                        data_len -= PAGE_SIZE;
                                }

                                /* Full success... */
                                break;
                        }
                        err = -ENOBUFS;
                        goto failure;
                }
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }

        skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
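
/*
 * Minimal usage sketch (hypothetical sendmsg path): a datagram
 * protocol blocks here until write memory is available (or returns
 * at once for a non-blocking socket), then copies the payload in.
 * "reserve" stands for protocol/link-layer header space:
 *
 *      skb = sock_alloc_send_skb(sk, len + reserve,
 *                                msg->msg_flags & MSG_DONTWAIT, &err);
 *      if (skb == NULL)
 *              goto out;       (err is -EAGAIN, -EPIPE, -ENOBUFS, ...)
 */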

static void __lock_sock(struct sock *sk)
{
        DEFINE_WAIT(wait);

        for(;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                if(!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
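
/*
 * Minimal usage sketch (hypothetical recvmsg path): callers hold the
 * socket lock and loop until data arrives, the timeout expires or a
 * signal is pending; sk_wait_event() drops and retakes the lock
 * around the actual schedule:
 *
 *      lock_sock(sk);
 *      while (skb_queue_empty(&sk->sk_receive_queue)) {
 *              if (!timeo || signal_pending(current))
 *                      break;
 *              sk_wait_data(sk, &timeo);
 *      }
 *      release_sock(sk);
 */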

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
        return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);
        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}
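
/*
 * Minimal usage sketch (hypothetical family): a protocol that has no
 * sensible implementation for an operation points the proto_ops entry
 * at the matching sock_no_*() stub rather than leaving it NULL.
 * "PF_EXAMPLE"/"example_ops" are stand-ins for illustration:
 *
 *      static struct proto_ops example_ops = {
 *              .family     = PF_EXAMPLE,
 *              .mmap       = sock_no_mmap,
 *              .socketpair = sock_no_socketpair,
 *              (remaining members elided)
 *      };
 */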

/*
 *      Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,0,POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
        sk_wake_async(sk,1,POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, 2, POLL_OUT);
        }

        read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (timer_pending(timer) && del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
        skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_socket           =       sock;

        sock_set_flag(sk, SOCK_ZAPPED);

        if(sock)
        {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);

        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp.tv_sec     = -1L;
        sk->sk_stamp.tv_usec    = -1L;

        atomic_set(&sk->sk_refcnt, 1);
}
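
/*
 * Minimal usage sketch (hypothetical protocol): after the generic
 * initialisation above, a protocol may override individual callbacks
 * while keeping the remaining defaults.  The example_* names are
 * protocol-private stand-ins:
 *
 *      sock_init_data(sock, sk);
 *      sk->sk_data_ready = example_data_ready;
 *      sk->sk_destruct   = example_destruct;
 */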

void fastcall lock_sock(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_lock.owner)
                __lock_sock(sk);
        sk->sk_lock.owner = (void *)1;
        spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
        spin_lock_bh(&(sk->sk_lock.slock));
        if (sk->sk_backlog.tail)
                __release_sock(sk);
        sk->sk_lock.owner = NULL;
        if (waitqueue_active(&(sk->sk_lock.wq)))
                wake_up(&(sk->sk_lock.wq));
        spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);
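
/*
 * Canonical pattern (sketch): process context brackets socket state
 * changes with the socket lock; packets arriving in softirq context
 * meanwhile are queued on the backlog and replayed by release_sock():
 *
 *      lock_sock(sk);
 *      ... update sk, possibly sleeping ...
 *      release_sock(sk);
 *
 * lock_sock() may sleep and so must not be called from BH context;
 * bh_lock_sock()/bh_unlock_sock() serve that side.
 */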

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk);
        if (sk->sk_stamp.tv_sec == -1)
                return -ENOENT;
        if (sk->sk_stamp.tv_sec == 0)
                do_gettimeofday(&sk->sk_stamp);
        return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
                -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
        if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        }
}
EXPORT_SYMBOL(sock_enable_timestamp);
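
/*
 * Reference point (hedged): sock_get_timestamp() backs the SIOCGSTAMP
 * ioctl, so a protocol ioctl handler forwards to it roughly as below;
 * the switch framing is a sketch, not a definitive handler:
 *
 *      case SIOCGSTAMP:
 *              return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *
 * The first call also enables timestamping via sock_enable_timestamp(),
 * so only packets received afterwards carry genuine stamps.
 */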

/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);
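
/*
 * Wiring sketch (hypothetical ops table): protocols that keep option and
 * receive logic entirely in struct proto can point their proto_ops at
 * the sock_common_* wrappers, e.g.:
 *
 *      static const struct proto_ops example_ops = {
 *              ...
 *              .setsockopt = sock_common_setsockopt,
 *              .getsockopt = sock_common_getsockopt,
 *              .recvmsg    = sock_common_recvmsg,
 *      };
 *
 * example_ops is illustrative; the inet stream and dgram ops use the
 * same helpers in mainline.
 */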

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt != NULL)
                return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
                        struct msghdr *msg, size_t size, int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err;

        err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &addr_len);
        if (err >= 0)
                msg->msg_namelen = addr_len;
        return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *      Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt != NULL)
                return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the network stack still does.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are still in flight, because a CPU
         * running the receive path did its hash-table lookup before we
         * unhashed the socket. Such packets will reach the receive queue
         * and be purged by the socket destructor.
         *
         * We may also still have packets pending on the receive queue
         * and, probably, our own packets waiting in device queues.
         * sock_destroy will drain the receive queue, but transmitted
         * packets will delay socket destruction until the last reference
         * is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);
        sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);
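
/*
 * Usage note (hedged): simple protocols call this as, or from, their
 * struct proto .close hook once the socket is finished, e.g.:
 *
 *      static void example_close(struct sock *sk, long timeout)
 *      {
 *              sk_common_release(sk);
 *      }
 *
 * example_close() is an illustrative name; raw IPv4 sockets follow this
 * shape in mainline.
 */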

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
        char *request_sock_slab_name = NULL;
        char *timewait_sock_slab_name;
        int rc = -ENOBUFS;

        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                               SLAB_HWCACHE_ALIGN, NULL, NULL);

                if (prot->slab == NULL) {
                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                               prot->name);
                        goto out;
                }

                if (prot->rsk_prot != NULL) {
                        static const char mask[] = "request_sock_%s";

                        request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
                        if (request_sock_slab_name == NULL)
                                goto out_free_sock_slab;

                        sprintf(request_sock_slab_name, mask, prot->name);
                        prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
                                                                 prot->rsk_prot->obj_size, 0,
                                                                 SLAB_HWCACHE_ALIGN, NULL, NULL);

                        if (prot->rsk_prot->slab == NULL) {
                                printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
                                       prot->name);
                                goto out_free_request_sock_slab_name;
                        }
                }

                if (prot->twsk_prot != NULL) {
                        static const char mask[] = "tw_sock_%s";

                        timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

                        if (timewait_sock_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        sprintf(timewait_sock_slab_name, mask, prot->name);
                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(timewait_sock_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0, SLAB_HWCACHE_ALIGN,
                                                  NULL, NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
                }
        }

        write_lock(&proto_list_lock);
        list_add(&prot->node, &proto_list);
        write_unlock(&proto_list_lock);
        rc = 0;
out:
        return rc;
out_free_timewait_sock_slab_name:
        kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
        if (prot->rsk_prot && prot->rsk_prot->slab) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                prot->rsk_prot->slab = NULL;
        }
out_free_request_sock_slab_name:
        kfree(request_sock_slab_name);
out_free_sock_slab:
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
        goto out;
}

EXPORT_SYMBOL(proto_register);
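
/*
 * Registration sketch (hypothetical protocol, field subset only):
 *
 *      static struct proto example_proto = {
 *              .name     = "EXAMPLE",
 *              .owner    = THIS_MODULE,
 *              .obj_size = sizeof(struct example_sock),
 *      };
 *
 *      err = proto_register(&example_proto, 1);
 *
 * Passing alloc_slab == 1 creates a per-protocol kmem_cache for socks;
 * with 0 the slab is skipped and sk_alloc() falls back to kmalloc()
 * using prot->obj_size.
 */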

void proto_unregister(struct proto *prot)
{
        write_lock(&proto_list_lock);
        list_del(&prot->node);
        write_unlock(&proto_list_lock);

        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }

        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
                const char *name = kmem_cache_name(prot->rsk_prot->slab);

                kmem_cache_destroy(prot->rsk_prot->slab);
                kfree(name);
                prot->rsk_prot->slab = NULL;
        }

        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);

                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
        return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
        return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
        return proto->node.next == &proto_list ? NULL :
                list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
        struct proto *proto;
        loff_t i = 0;

        list_for_each_entry(proto, &proto_list, node)
                if (i++ == pos)
                        goto out;

        proto = NULL;
out:
        return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&proto_list_lock);
        return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, v);
        return 0;
}
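
/*
 * Resulting /proc/net/protocols layout (derived from the format strings
 * above, shown only for orientation):
 *
 *   protocol  size sockets  memory press maxhdr slab module ...
 *
 * followed by one row per registered proto, where the trailing 'y'/'n'
 * columns report which struct proto methods are implemented (see
 * proto_method_implemented()).
 */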

static struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

static int __init proto_init(void)
{
        /* register /proc/net/protocols */
        return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* CONFIG_PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif