Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux-2.6] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:     Ross Biro
13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *              Alan Cox        :       Numerous verify_area() problems
19  *              Alan Cox        :       Connecting on a connecting socket
20  *                                      now returns an error for tcp.
21  *              Alan Cox        :       sock->protocol is set correctly.
22  *                                      and is not sometimes left as 0.
23  *              Alan Cox        :       connect handles icmp errors on a
24  *                                      connect properly. Unfortunately there
25  *                                      is a restart syscall nasty there. I
26  *                                      can't match BSD without hacking the C
27  *                                      library. Ideas urgently sought!
28  *              Alan Cox        :       Disallow bind() to addresses that are
29  *                                      not ours - especially broadcast ones!!
30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
32  *                                      instead they leave that for the DESTROY timer.
33  *              Alan Cox        :       Clean up error flag in accept
34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
35  *                                      was buggy. Put a remove_sock() in the handler
36  *                                      for memory when we hit 0. Also altered the timer
37  *                                      code. The ACK stuff can wait and needs major 
38  *                                      TCP layer surgery.
39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
40  *                                      and fixed timer/inet_bh race.
41  *              Alan Cox        :       Added zapped flag for TCP
42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
49  *      Pauline Middelink       :       identd support
50  *              Alan Cox        :       Fixed connect() taking signals I think.
51  *              Alan Cox        :       SO_LINGER supported
52  *              Alan Cox        :       Error reporting fixes
53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
54  *              Alan Cox        :       inet sockets don't set sk->type!
55  *              Alan Cox        :       Split socket option code
56  *              Alan Cox        :       Callbacks
57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
58  *              Alex            :       Removed restriction on inet fioctl
59  *              Alan Cox        :       Splitting INET from NET core
60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
62  *              Alan Cox        :       Split IP from generic code
63  *              Alan Cox        :       New kfree_skbmem()
64  *              Alan Cox        :       Make SO_DEBUG superuser only.
65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
66  *                                      (compatibility fix)
67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
68  *              Alan Cox        :       Allocator for a socket is settable.
69  *              Alan Cox        :       SO_ERROR includes soft errors.
70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
71  *              Alan Cox        :       Generic socket allocation to make hooks
72  *                                      easier (suggested by Craig Metz).
73  *              Michael Pall    :       SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
81  *              Andi Kleen      :       Fix write_space callback
82  *              Chris Evans     :       Security fixes - signedness again
83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *              This program is free software; you can redistribute it and/or
89  *              modify it under the terms of the GNU General Public License
90  *              as published by the Free Software Foundation; either version
91  *              2 of the License, or (at your option) any later version.
92  */
93
94 #include <linux/config.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114
115 #include <asm/uaccess.h>
116 #include <asm/system.h>
117
118 #include <linux/netdevice.h>
119 #include <net/protocol.h>
120 #include <linux/skbuff.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <net/xfrm.h>
124 #include <linux/ipsec.h>
125
126 #include <linux/filter.h>
127
128 #ifdef CONFIG_INET
129 #include <net/tcp.h>
130 #endif
131
132 /* Take into consideration the size of the struct sk_buff overhead in the
133  * determination of these values, since that is non-constant across
134  * platforms.  This makes socket queueing behavior and performance
135  * not depend upon such differences.
136  */
137 #define _SK_MEM_PACKETS         256
138 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
139 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
140 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
141
142 /* Run time adjustable parameters; sk_init() overrides some of them based on num_physpages. */
143 __u32 sysctl_wmem_max = SK_WMEM_MAX;
144 __u32 sysctl_rmem_max = SK_RMEM_MAX;
145 __u32 sysctl_wmem_default = SK_WMEM_MAX;
146 __u32 sysctl_rmem_default = SK_RMEM_MAX;
147
148 /* Maximal space eaten by iovec or ancillary data plus some space */
149 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
150
151 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
152 {
153         struct timeval tv;
154
155         if (optlen < sizeof(tv))
156                 return -EINVAL;
157         if (copy_from_user(&tv, optval, sizeof(tv)))
158                 return -EFAULT;
159
160         *timeo_p = MAX_SCHEDULE_TIMEOUT;
161         if (tv.tv_sec == 0 && tv.tv_usec == 0)
162                 return 0;
163         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
164                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
165         return 0;
166 }
167
168 static void sock_warn_obsolete_bsdism(const char *name)
169 {
170         static int warned;
171         static char warncomm[TASK_COMM_LEN];
172         if (strcmp(warncomm, current->comm) && warned < 5) { 
173                 strcpy(warncomm,  current->comm); 
174                 printk(KERN_WARNING "process `%s' is using obsolete "
175                        "%s SO_BSDCOMPAT\n", warncomm, name);
176                 warned++;
177         }
178 }
179
180 static void sock_disable_timestamp(struct sock *sk)
181 {       
182         if (sock_flag(sk, SOCK_TIMESTAMP)) { 
183                 sock_reset_flag(sk, SOCK_TIMESTAMP);
184                 net_disable_timestamp();
185         }
186 }
187
188
189 /*
190  *      This is meant for all protocols to use and covers goings on
191  *      at the socket level. Everything here is generic.
192  */
193
194 int sock_setsockopt(struct socket *sock, int level, int optname,
195                     char __user *optval, int optlen)
196 {
197         struct sock *sk=sock->sk;
198         struct sk_filter *filter;
199         int val;
200         int valbool;
201         struct linger ling;
202         int ret = 0;
203         
204         /*
205          *      Options without arguments
206          */
207
208 #ifdef SO_DONTLINGER            /* Compatibility item... */
209         if (optname == SO_DONTLINGER) {
210                 lock_sock(sk);
211                 sock_reset_flag(sk, SOCK_LINGER);
212                 release_sock(sk);
213                 return 0;
214         }
215 #endif
216         
217         if(optlen<sizeof(int))
218                 return(-EINVAL);
219         
220         if (get_user(val, (int __user *)optval))
221                 return -EFAULT;
222         
223         valbool = val?1:0;
224
225         lock_sock(sk);
226
227         switch(optname) 
228         {
229                 case SO_DEBUG:  
230                         if(val && !capable(CAP_NET_ADMIN))
231                         {
232                                 ret = -EACCES;
233                         }
234                         else if (valbool)
235                                 sock_set_flag(sk, SOCK_DBG);
236                         else
237                                 sock_reset_flag(sk, SOCK_DBG);
238                         break;
239                 case SO_REUSEADDR:
240                         sk->sk_reuse = valbool;
241                         break;
242                 case SO_TYPE:
243                 case SO_ERROR:
244                         ret = -ENOPROTOOPT;
245                         break;
246                 case SO_DONTROUTE:
247                         if (valbool)
248                                 sock_set_flag(sk, SOCK_LOCALROUTE);
249                         else
250                                 sock_reset_flag(sk, SOCK_LOCALROUTE);
251                         break;
252                 case SO_BROADCAST:
253                         sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
254                         break;
255                 case SO_SNDBUF:
256                         /* Don't error on this BSD doesn't and if you think
257                            about it this is right. Otherwise apps have to
258                            play 'guess the biggest size' games. RCVBUF/SNDBUF
259                            are treated in BSD as hints */
260                            
261                         if (val > sysctl_wmem_max)
262                                 val = sysctl_wmem_max;
263 set_sndbuf:
264                         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
265                         if ((val * 2) < SOCK_MIN_SNDBUF)
266                                 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
267                         else
268                                 sk->sk_sndbuf = val * 2;
269
270                         /*
271                          *      Wake up sending tasks if we
272                          *      upped the value.
273                          */
274                         sk->sk_write_space(sk);
275                         break;
276
277                 case SO_SNDBUFFORCE:
278                         if (!capable(CAP_NET_ADMIN)) {
279                                 ret = -EPERM;
280                                 break;
281                         }
282                         goto set_sndbuf;
283
284                 case SO_RCVBUF:
285                         /* Don't error on this BSD doesn't and if you think
286                            about it this is right. Otherwise apps have to
287                            play 'guess the biggest size' games. RCVBUF/SNDBUF
288                            are treated in BSD as hints */
289                           
290                         if (val > sysctl_rmem_max)
291                                 val = sysctl_rmem_max;
292 set_rcvbuf:
293                         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
294                         /* FIXME: is this lower bound the right one? */
295                         if ((val * 2) < SOCK_MIN_RCVBUF)
296                                 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
297                         else
298                                 sk->sk_rcvbuf = val * 2;
299                         break;
300
301                 case SO_RCVBUFFORCE:
302                         if (!capable(CAP_NET_ADMIN)) {
303                                 ret = -EPERM;
304                                 break;
305                         }
306                         goto set_rcvbuf;
307
308                 case SO_KEEPALIVE:
309 #ifdef CONFIG_INET
310                         if (sk->sk_protocol == IPPROTO_TCP)
311                                 tcp_set_keepalive(sk, valbool);
312 #endif
313                         sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
314                         break;
315
316                 case SO_OOBINLINE:
317                         sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
318                         break;
319
320                 case SO_NO_CHECK:
321                         sk->sk_no_check = valbool;
322                         break;
323
324                 case SO_PRIORITY:
325                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
326                                 sk->sk_priority = val;
327                         else
328                                 ret = -EPERM;
329                         break;
330
331                 case SO_LINGER:
332                         if(optlen<sizeof(ling)) {
333                                 ret = -EINVAL;  /* 1003.1g */
334                                 break;
335                         }
336                         if (copy_from_user(&ling,optval,sizeof(ling))) {
337                                 ret = -EFAULT;
338                                 break;
339                         }
340                         if (!ling.l_onoff)
341                                 sock_reset_flag(sk, SOCK_LINGER);
342                         else {
343 #if (BITS_PER_LONG == 32)
344                                 if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
345                                         sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
346                                 else
347 #endif
348                                         sk->sk_lingertime = ling.l_linger * HZ;
349                                 sock_set_flag(sk, SOCK_LINGER);
350                         }
351                         break;
352
353                 case SO_BSDCOMPAT:
354                         sock_warn_obsolete_bsdism("setsockopt");
355                         break;
356
357                 case SO_PASSCRED:
358                         if (valbool)
359                                 set_bit(SOCK_PASSCRED, &sock->flags);
360                         else
361                                 clear_bit(SOCK_PASSCRED, &sock->flags);
362                         break;
363
364                 case SO_TIMESTAMP:
365                         if (valbool)  {
366                                 sock_set_flag(sk, SOCK_RCVTSTAMP);
367                                 sock_enable_timestamp(sk);
368                         } else
369                                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
370                         break;
371
372                 case SO_RCVLOWAT:
373                         if (val < 0)
374                                 val = INT_MAX;
375                         sk->sk_rcvlowat = val ? : 1;
376                         break;
377
378                 case SO_RCVTIMEO:
379                         ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
380                         break;
381
382                 case SO_SNDTIMEO:
383                         ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
384                         break;
385
386 #ifdef CONFIG_NETDEVICES
387                 case SO_BINDTODEVICE:
388                 {
389                         char devname[IFNAMSIZ]; 
390
391                         /* Sorry... */ 
392                         if (!capable(CAP_NET_RAW)) {
393                                 ret = -EPERM;
394                                 break;
395                         }
396
397                         /* Bind this socket to a particular device like "eth0",
398                          * as specified in the passed interface name. If the
399                          * name is "" or the option length is zero the socket 
400                          * is not bound. 
401                          */ 
402
403                         if (!valbool) {
404                                 sk->sk_bound_dev_if = 0;
405                         } else {
406                                 if (optlen > IFNAMSIZ) 
407                                         optlen = IFNAMSIZ; 
408                                 if (copy_from_user(devname, optval, optlen)) {
409                                         ret = -EFAULT;
410                                         break;
411                                 }
412
413                                 /* Remove any cached route for this socket. */
414                                 sk_dst_reset(sk);
415
416                                 if (devname[0] == '\0') {
417                                         sk->sk_bound_dev_if = 0;
418                                 } else {
419                                         struct net_device *dev = dev_get_by_name(devname);
420                                         if (!dev) {
421                                                 ret = -ENODEV;
422                                                 break;
423                                         }
424                                         sk->sk_bound_dev_if = dev->ifindex;
425                                         dev_put(dev);
426                                 }
427                         }
428                         break;
429                 }
430 #endif
431
432
433                 case SO_ATTACH_FILTER:
434                         ret = -EINVAL;
435                         if (optlen == sizeof(struct sock_fprog)) {
436                                 struct sock_fprog fprog;
437
438                                 ret = -EFAULT;
439                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
440                                         break;
441
442                                 ret = sk_attach_filter(&fprog, sk);
443                         }
444                         break;
445
446                 case SO_DETACH_FILTER:
447                         spin_lock_bh(&sk->sk_lock.slock);
448                         filter = sk->sk_filter;
449                         if (filter) {
450                                 sk->sk_filter = NULL;
451                                 spin_unlock_bh(&sk->sk_lock.slock);
452                                 sk_filter_release(sk, filter);
453                                 break;
454                         }
455                         spin_unlock_bh(&sk->sk_lock.slock);
456                         ret = -ENONET;
457                         break;
458
459                 /* We implement the SO_SNDLOWAT etc to
460                    not be settable (1003.1g 5.3) */
461                 default:
462                         ret = -ENOPROTOOPT;
463                         break;
464         }
465         release_sock(sk);
466         return ret;
467 }
468
469
470 int sock_getsockopt(struct socket *sock, int level, int optname,
471                     char __user *optval, int __user *optlen)
472 {
473         struct sock *sk = sock->sk;
474         
475         union
476         {
477                 int val;
478                 struct linger ling;
479                 struct timeval tm;
480         } v;
481         
482         unsigned int lv = sizeof(int);
483         int len;
484         
485         if(get_user(len,optlen))
486                 return -EFAULT;
487         if(len < 0)
488                 return -EINVAL;
489                 
490         switch(optname) 
491         {
492                 case SO_DEBUG:          
493                         v.val = sock_flag(sk, SOCK_DBG);
494                         break;
495                 
496                 case SO_DONTROUTE:
497                         v.val = sock_flag(sk, SOCK_LOCALROUTE);
498                         break;
499                 
500                 case SO_BROADCAST:
501                         v.val = !!sock_flag(sk, SOCK_BROADCAST);
502                         break;
503
504                 case SO_SNDBUF:
505                         v.val = sk->sk_sndbuf;
506                         break;
507                 
508                 case SO_RCVBUF:
509                         v.val = sk->sk_rcvbuf;
510                         break;
511
512                 case SO_REUSEADDR:
513                         v.val = sk->sk_reuse;
514                         break;
515
516                 case SO_KEEPALIVE:
517                         v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
518                         break;
519
520                 case SO_TYPE:
521                         v.val = sk->sk_type;                            
522                         break;
523
524                 case SO_ERROR:
525                         v.val = -sock_error(sk);
526                         if(v.val==0)
527                                 v.val = xchg(&sk->sk_err_soft, 0);
528                         break;
529
530                 case SO_OOBINLINE:
531                         v.val = !!sock_flag(sk, SOCK_URGINLINE);
532                         break;
533         
534                 case SO_NO_CHECK:
535                         v.val = sk->sk_no_check;
536                         break;
537
538                 case SO_PRIORITY:
539                         v.val = sk->sk_priority;
540                         break;
541                 
542                 case SO_LINGER: 
543                         lv              = sizeof(v.ling);
544                         v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
545                         v.ling.l_linger = sk->sk_lingertime / HZ;
546                         break;
547                                         
548                 case SO_BSDCOMPAT:
549                         sock_warn_obsolete_bsdism("getsockopt");
550                         break;
551
552                 case SO_TIMESTAMP:
553                         v.val = sock_flag(sk, SOCK_RCVTSTAMP);
554                         break;
555
556                 case SO_RCVTIMEO:
557                         lv=sizeof(struct timeval);
558                         if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
559                                 v.tm.tv_sec = 0;
560                                 v.tm.tv_usec = 0;
561                         } else {
562                                 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
563                                 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
564                         }
565                         break;
566
567                 case SO_SNDTIMEO:
568                         lv=sizeof(struct timeval);
569                         if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
570                                 v.tm.tv_sec = 0;
571                                 v.tm.tv_usec = 0;
572                         } else {
573                                 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
574                                 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
575                         }
576                         break;
577
578                 case SO_RCVLOWAT:
579                         v.val = sk->sk_rcvlowat;
580                         break;
581
582                 case SO_SNDLOWAT:
583                         v.val=1;
584                         break; 
585
586                 case SO_PASSCRED:
587                         v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
588                         break;
589
590                 case SO_PEERCRED:
591                         if (len > sizeof(sk->sk_peercred))
592                                 len = sizeof(sk->sk_peercred);
593                         if (copy_to_user(optval, &sk->sk_peercred, len))
594                                 return -EFAULT;
595                         goto lenout;
596
597                 case SO_PEERNAME:
598                 {
599                         char address[128];
600
601                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
602                                 return -ENOTCONN;
603                         if (lv < len)
604                                 return -EINVAL;
605                         if (copy_to_user(optval, address, len))
606                                 return -EFAULT;
607                         goto lenout;
608                 }
609
610                 /* Dubious BSD thing... Probably nobody even uses it, but
611                  * the UNIX standard wants it for whatever reason... -DaveM
612                  */
613                 case SO_ACCEPTCONN:
614                         v.val = sk->sk_state == TCP_LISTEN;
615                         break;
616
617                 case SO_PEERSEC:
618                         return security_socket_getpeersec(sock, optval, optlen, len);
619
620                 default:
621                         return(-ENOPROTOOPT);
622         }
623         if (len > lv)
624                 len = lv;
625         if (copy_to_user(optval, &v, len))
626                 return -EFAULT;
627 lenout:
628         if (put_user(len, optlen))
629                 return -EFAULT;
630         return 0;
631 }
632
633 /**
634  *      sk_alloc - All socket objects are allocated here
635  *      @family: protocol family
636  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
637  *      @prot: struct proto associated with this new sock instance
638  *      @zero_it: if we should zero the newly allocated sock
639  */
640 struct sock *sk_alloc(int family, unsigned int __nocast priority,
641                       struct proto *prot, int zero_it)
642 {
643         struct sock *sk = NULL;
644         kmem_cache_t *slab = prot->slab;
645
646         if (slab != NULL)
647                 sk = kmem_cache_alloc(slab, priority);
648         else
649                 sk = kmalloc(prot->obj_size, priority);
650
651         if (sk) {
652                 if (zero_it) {
653                         memset(sk, 0, prot->obj_size);
654                         sk->sk_family = family;
655                         /*
656                          * See comment in struct sock definition to understand
657                          * why we need sk_prot_creator -acme
658                          */
659                         sk->sk_prot = sk->sk_prot_creator = prot;
660                         sock_lock_init(sk);
661                 }
662                 
663                 if (security_sk_alloc(sk, family, priority)) {
664                         if (slab != NULL)
665                                 kmem_cache_free(slab, sk);
666                         else
667                                 kfree(sk);
668                         sk = NULL;
669                 } else
670                         __module_get(prot->owner);
671         }
672         return sk;
673 }
674
675 void sk_free(struct sock *sk)
676 {
677         struct sk_filter *filter;
678         struct module *owner = sk->sk_prot_creator->owner;
679
680         if (sk->sk_destruct)
681                 sk->sk_destruct(sk);
682
683         filter = sk->sk_filter;
684         if (filter) {
685                 sk_filter_release(sk, filter);
686                 sk->sk_filter = NULL;
687         }
688
689         sock_disable_timestamp(sk);
690
691         if (atomic_read(&sk->sk_omem_alloc))
692                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
693                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
694
695         security_sk_free(sk);
696         if (sk->sk_prot_creator->slab != NULL)
697                 kmem_cache_free(sk->sk_prot_creator->slab, sk);
698         else
699                 kfree(sk);
700         module_put(owner);
701 }
702
703 struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
704 {
705         struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
706
707         if (newsk != NULL) {
708                 struct sk_filter *filter;
709
710                 memcpy(newsk, sk, sk->sk_prot->obj_size);
711
712                 /* SANITY */
713                 sk_node_init(&newsk->sk_node);
714                 sock_lock_init(newsk);
715                 bh_lock_sock(newsk);
716
717                 atomic_set(&newsk->sk_rmem_alloc, 0);
718                 atomic_set(&newsk->sk_wmem_alloc, 0);
719                 atomic_set(&newsk->sk_omem_alloc, 0);
720                 skb_queue_head_init(&newsk->sk_receive_queue);
721                 skb_queue_head_init(&newsk->sk_write_queue);
722
723                 rwlock_init(&newsk->sk_dst_lock);
724                 rwlock_init(&newsk->sk_callback_lock);
725
726                 newsk->sk_dst_cache     = NULL;
727                 newsk->sk_wmem_queued   = 0;
728                 newsk->sk_forward_alloc = 0;
729                 newsk->sk_send_head     = NULL;
730                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
731                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
732
733                 sock_reset_flag(newsk, SOCK_DONE);
734                 skb_queue_head_init(&newsk->sk_error_queue);
735
736                 filter = newsk->sk_filter;
737                 if (filter != NULL)
738                         sk_filter_charge(newsk, filter);
739
740                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
741                         /* It is still raw copy of parent, so invalidate
742                          * destructor and make plain sk_free() */
743                         newsk->sk_destruct = NULL;
744                         sk_free(newsk);
745                         newsk = NULL;
746                         goto out;
747                 }
748
749                 newsk->sk_err      = 0;
750                 newsk->sk_priority = 0;
751                 atomic_set(&newsk->sk_refcnt, 2);
752
753                 /*
754                  * Increment the counter in the same struct proto as the master
755                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
756                  * is the same as sk->sk_prot->socks, as this field was copied
757                  * with memcpy).
758                  *
759                  * This _changes_ the previous behaviour, where
760                  * tcp_create_openreq_child always was incrementing the
761                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
762                  * to be taken into account in all callers. -acme
763                  */
764                 sk_refcnt_debug_inc(newsk);
765                 newsk->sk_socket = NULL;
766                 newsk->sk_sleep  = NULL;
767
768                 if (newsk->sk_prot->sockets_allocated)
769                         atomic_inc(newsk->sk_prot->sockets_allocated);
770         }
771 out:
772         return newsk;
773 }
774
775 EXPORT_SYMBOL_GPL(sk_clone);
776
777 void __init sk_init(void)
778 {
779         if (num_physpages <= 4096) {
780                 sysctl_wmem_max = 32767;
781                 sysctl_rmem_max = 32767;
782                 sysctl_wmem_default = 32767;
783                 sysctl_rmem_default = 32767;
784         } else if (num_physpages >= 131072) {
785                 sysctl_wmem_max = 131071;
786                 sysctl_rmem_max = 131071;
787         }
788 }
789
790 /*
791  *      Simple resource managers for sockets.
792  */
793
794
795 /* 
796  * Write buffer destructor automatically called from kfree_skb. 
797  */
798 void sock_wfree(struct sk_buff *skb)
799 {
800         struct sock *sk = skb->sk;
801
802         /* In case it might be waiting for more memory. */
803         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
804         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
805                 sk->sk_write_space(sk);
806         sock_put(sk);
807 }
808
809 /* 
810  * Read buffer destructor automatically called from kfree_skb. 
811  */
812 void sock_rfree(struct sk_buff *skb)
813 {
814         struct sock *sk = skb->sk;
815
816         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
817 }
818
819
820 int sock_i_uid(struct sock *sk)
821 {
822         int uid;
823
824         read_lock(&sk->sk_callback_lock);
825         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
826         read_unlock(&sk->sk_callback_lock);
827         return uid;
828 }
829
830 unsigned long sock_i_ino(struct sock *sk)
831 {
832         unsigned long ino;
833
834         read_lock(&sk->sk_callback_lock);
835         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
836         read_unlock(&sk->sk_callback_lock);
837         return ino;
838 }
839
840 /*
841  * Allocate a skb from the socket's send buffer.
842  */
843 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
844                              unsigned int __nocast priority)
845 {
846         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
847                 struct sk_buff * skb = alloc_skb(size, priority);
848                 if (skb) {
849                         skb_set_owner_w(skb, sk);
850                         return skb;
851                 }
852         }
853         return NULL;
854 }
855
856 /*
857  * Allocate a skb from the socket's receive buffer.
858  */ 
859 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
860                              unsigned int __nocast priority)
861 {
862         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
863                 struct sk_buff *skb = alloc_skb(size, priority);
864                 if (skb) {
865                         skb_set_owner_r(skb, sk);
866                         return skb;
867                 }
868         }
869         return NULL;
870 }
871
872 /* 
873  * Allocate a memory block from the socket's option memory buffer.
874  */ 
875 void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
876 {
877         if ((unsigned)size <= sysctl_optmem_max &&
878             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
879                 void *mem;
880                 /* First do the add, to avoid the race if kmalloc
881                  * might sleep.
882                  */
883                 atomic_add(size, &sk->sk_omem_alloc);
884                 mem = kmalloc(size, priority);
885                 if (mem)
886                         return mem;
887                 atomic_sub(size, &sk->sk_omem_alloc);
888         }
889         return NULL;
890 }
891
892 /*
893  * Free an option memory block.
894  */
895 void sock_kfree_s(struct sock *sk, void *mem, int size)
896 {
897         kfree(mem);
898         atomic_sub(size, &sk->sk_omem_alloc);
899 }
900
901 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
902    I think, these locks should be removed for datagram sockets.
903  */
904 static long sock_wait_for_wmem(struct sock * sk, long timeo)
905 {
906         DEFINE_WAIT(wait);
907
908         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
909         for (;;) {
910                 if (!timeo)
911                         break;
912                 if (signal_pending(current))
913                         break;
914                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
915                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
916                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
917                         break;
918                 if (sk->sk_shutdown & SEND_SHUTDOWN)
919                         break;
920                 if (sk->sk_err)
921                         break;
922                 timeo = schedule_timeout(timeo);
923         }
924         finish_wait(sk->sk_sleep, &wait);
925         return timeo;
926 }
927
928
929 /*
930  *      Generic send/receive buffer handlers
931  */
932
933 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
934                                             unsigned long header_len,
935                                             unsigned long data_len,
936                                             int noblock, int *errcode)
937 {
938         struct sk_buff *skb;
939         unsigned int gfp_mask;
940         long timeo;
941         int err;
942
943         gfp_mask = sk->sk_allocation;
944         if (gfp_mask & __GFP_WAIT)
945                 gfp_mask |= __GFP_REPEAT;
946
947         timeo = sock_sndtimeo(sk, noblock);
948         while (1) {
949                 err = sock_error(sk);
950                 if (err != 0)
951                         goto failure;
952
953                 err = -EPIPE;
954                 if (sk->sk_shutdown & SEND_SHUTDOWN)
955                         goto failure;
956
957                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
958                         skb = alloc_skb(header_len, sk->sk_allocation);
959                         if (skb) {
960                                 int npages;
961                                 int i;
962
963                                 /* No pages, we're done... */
964                                 if (!data_len)
965                                         break;
966
967                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
968                                 skb->truesize += data_len;
969                                 skb_shinfo(skb)->nr_frags = npages;
970                                 for (i = 0; i < npages; i++) {
971                                         struct page *page;
972                                         skb_frag_t *frag;
973
974                                         page = alloc_pages(sk->sk_allocation, 0);
975                                         if (!page) {
976                                                 err = -ENOBUFS;
977                                                 skb_shinfo(skb)->nr_frags = i;
978                                                 kfree_skb(skb);
979                                                 goto failure;
980                                         }
981
982                                         frag = &skb_shinfo(skb)->frags[i];
983                                         frag->page = page;
984                                         frag->page_offset = 0;
985                                         frag->size = (data_len >= PAGE_SIZE ?
986                                                       PAGE_SIZE :
987                                                       data_len);
988                                         data_len -= PAGE_SIZE;
989                                 }
990
991                                 /* Full success... */
992                                 break;
993                         }
994                         err = -ENOBUFS;
995                         goto failure;
996                 }
997                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
998                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
999                 err = -EAGAIN;
1000                 if (!timeo)
1001                         goto failure;
1002                 if (signal_pending(current))
1003                         goto interrupted;
1004                 timeo = sock_wait_for_wmem(sk, timeo);
1005         }
1006
1007         skb_set_owner_w(skb, sk);
1008         return skb;
1009
1010 interrupted:
1011         err = sock_intr_errno(timeo);
1012 failure:
1013         *errcode = err;
1014         return NULL;
1015 }
1016
/*
 * Allocate a purely linear send skb of @size bytes: a thin wrapper around
 * sock_alloc_send_pskb() that requests no page fragments.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1022
1023 static void __lock_sock(struct sock *sk)
1024 {
1025         DEFINE_WAIT(wait);
1026
1027         for(;;) {
1028                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1029                                         TASK_UNINTERRUPTIBLE);
1030                 spin_unlock_bh(&sk->sk_lock.slock);
1031                 schedule();
1032                 spin_lock_bh(&sk->sk_lock.slock);
1033                 if(!sock_owned_by_user(sk))
1034                         break;
1035         }
1036         finish_wait(&sk->sk_lock.wq, &wait);
1037 }
1038
1039 static void __release_sock(struct sock *sk)
1040 {
1041         struct sk_buff *skb = sk->sk_backlog.head;
1042
1043         do {
1044                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1045                 bh_unlock_sock(sk);
1046
1047                 do {
1048                         struct sk_buff *next = skb->next;
1049
1050                         skb->next = NULL;
1051                         sk->sk_backlog_rcv(sk, skb);
1052
1053                         /*
1054                          * We are in process context here with softirqs
1055                          * disabled, use cond_resched_softirq() to preempt.
1056                          * This is safe to do because we've taken the backlog
1057                          * queue private:
1058                          */
1059                         cond_resched_softirq();
1060
1061                         skb = next;
1062                 } while (skb != NULL);
1063
1064                 bh_lock_sock(sk);
1065         } while((skb = sk->sk_backlog.head) != NULL);
1066 }
1067
1068 /**
1069  * sk_wait_data - wait for data to arrive at sk_receive_queue
1070  * @sk:    sock to wait on
1071  * @timeo: for how long
1072  *
1073  * Now socket state including sk->sk_err is changed only under lock,
1074  * hence we may omit checks after joining wait queue.
1075  * We check receive queue before schedule() only as optimization;
1076  * it is very likely that release_sock() added new data.
1077  */
1078 int sk_wait_data(struct sock *sk, long *timeo)
1079 {
1080         int rc;
1081         DEFINE_WAIT(wait);
1082
1083         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1084         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1085         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1086         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1087         finish_wait(sk->sk_sleep, &wait);
1088         return rc;
1089 }
1090
1091 EXPORT_SYMBOL(sk_wait_data);
1092
1093 /*
1094  * Set of default routines for initialising struct proto_ops when
1095  * the protocol does not support a particular function. In certain
1096  * cases where it makes no sense for a protocol to have a "do nothing"
1097  * function, some default processing is provided.
1098  */
1099
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1104
/* connect() stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1110
/* socketpair() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1115
/* accept() stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1120
/* getname() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1126
1127 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1128 {
1129         return 0;
1130 }
1131
/* ioctl() stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1136
/* listen() stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1141
/* shutdown() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1146
1147 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1148                     char __user *optval, int optlen)
1149 {
1150         return -EOPNOTSUPP;
1151 }
1152
1153 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1154                     char __user *optval, int __user *optlen)
1155 {
1156         return -EOPNOTSUPP;
1157 }
1158
/* sendmsg() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1164
/* recvmsg() stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1170
/*
 * mmap() stub for protocols without mmap support. Returns -ENODEV to
 * mirror the error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	const int no_mmap = -ENODEV;

	return no_mmap;
}
1176
1177 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1178 {
1179         ssize_t res;
1180         struct msghdr msg = {.msg_flags = flags};
1181         struct kvec iov;
1182         char *kaddr = kmap(page);
1183         iov.iov_base = kaddr + offset;
1184         iov.iov_len = size;
1185         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1186         kunmap(page);
1187         return res;
1188 }
1189
1190 /*
1191  *      Default Socket Callbacks
1192  */
1193
1194 static void sock_def_wakeup(struct sock *sk)
1195 {
1196         read_lock(&sk->sk_callback_lock);
1197         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1198                 wake_up_interruptible_all(sk->sk_sleep);
1199         read_unlock(&sk->sk_callback_lock);
1200 }
1201
1202 static void sock_def_error_report(struct sock *sk)
1203 {
1204         read_lock(&sk->sk_callback_lock);
1205         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1206                 wake_up_interruptible(sk->sk_sleep);
1207         sk_wake_async(sk,0,POLL_ERR); 
1208         read_unlock(&sk->sk_callback_lock);
1209 }
1210
1211 static void sock_def_readable(struct sock *sk, int len)
1212 {
1213         read_lock(&sk->sk_callback_lock);
1214         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1215                 wake_up_interruptible(sk->sk_sleep);
1216         sk_wake_async(sk,1,POLL_IN);
1217         read_unlock(&sk->sk_callback_lock);
1218 }
1219
1220 static void sock_def_write_space(struct sock *sk)
1221 {
1222         read_lock(&sk->sk_callback_lock);
1223
1224         /* Do not wake up a writer until he can make "significant"
1225          * progress.  --DaveM
1226          */
1227         if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1228                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1229                         wake_up_interruptible(sk->sk_sleep);
1230
1231                 /* Should agree with poll, otherwise some programs break */
1232                 if (sock_writeable(sk))
1233                         sk_wake_async(sk, 2, POLL_OUT);
1234         }
1235
1236         read_unlock(&sk->sk_callback_lock);
1237 }
1238
1239 static void sock_def_destruct(struct sock *sk)
1240 {
1241         if (sk->sk_protinfo)
1242                 kfree(sk->sk_protinfo);
1243 }
1244
1245 void sk_send_sigurg(struct sock *sk)
1246 {
1247         if (sk->sk_socket && sk->sk_socket->file)
1248                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1249                         sk_wake_async(sk, 3, POLL_PRI);
1250 }
1251
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (ret == 0)
		sock_hold(sk);
}
1258
1259 EXPORT_SYMBOL(sk_reset_timer);
1260
/*
 * Cancel @timer. If it was pending and del_timer() reports success, the
 * socket reference associated with it is dropped via __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
1266
1267 EXPORT_SYMBOL(sk_stop_timer);
1268
1269 void sock_init_data(struct socket *sock, struct sock *sk)
1270 {
1271         skb_queue_head_init(&sk->sk_receive_queue);
1272         skb_queue_head_init(&sk->sk_write_queue);
1273         skb_queue_head_init(&sk->sk_error_queue);
1274
1275         sk->sk_send_head        =       NULL;
1276
1277         init_timer(&sk->sk_timer);
1278         
1279         sk->sk_allocation       =       GFP_KERNEL;
1280         sk->sk_rcvbuf           =       sysctl_rmem_default;
1281         sk->sk_sndbuf           =       sysctl_wmem_default;
1282         sk->sk_state            =       TCP_CLOSE;
1283         sk->sk_socket           =       sock;
1284
1285         sock_set_flag(sk, SOCK_ZAPPED);
1286
1287         if(sock)
1288         {
1289                 sk->sk_type     =       sock->type;
1290                 sk->sk_sleep    =       &sock->wait;
1291                 sock->sk        =       sk;
1292         } else
1293                 sk->sk_sleep    =       NULL;
1294
1295         rwlock_init(&sk->sk_dst_lock);
1296         rwlock_init(&sk->sk_callback_lock);
1297
1298         sk->sk_state_change     =       sock_def_wakeup;
1299         sk->sk_data_ready       =       sock_def_readable;
1300         sk->sk_write_space      =       sock_def_write_space;
1301         sk->sk_error_report     =       sock_def_error_report;
1302         sk->sk_destruct         =       sock_def_destruct;
1303
1304         sk->sk_sndmsg_page      =       NULL;
1305         sk->sk_sndmsg_off       =       0;
1306
1307         sk->sk_peercred.pid     =       0;
1308         sk->sk_peercred.uid     =       -1;
1309         sk->sk_peercred.gid     =       -1;
1310         sk->sk_write_pending    =       0;
1311         sk->sk_rcvlowat         =       1;
1312         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1313         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1314
1315         sk->sk_stamp.tv_sec     = -1L;
1316         sk->sk_stamp.tv_usec    = -1L;
1317
1318         atomic_set(&sk->sk_refcnt, 1);
1319 }
1320
1321 void fastcall lock_sock(struct sock *sk)
1322 {
1323         might_sleep();
1324         spin_lock_bh(&(sk->sk_lock.slock));
1325         if (sk->sk_lock.owner)
1326                 __lock_sock(sk);
1327         sk->sk_lock.owner = (void *)1;
1328         spin_unlock_bh(&(sk->sk_lock.slock));
1329 }
1330
1331 EXPORT_SYMBOL(lock_sock);
1332
1333 void fastcall release_sock(struct sock *sk)
1334 {
1335         spin_lock_bh(&(sk->sk_lock.slock));
1336         if (sk->sk_backlog.tail)
1337                 __release_sock(sk);
1338         sk->sk_lock.owner = NULL;
1339         if (waitqueue_active(&(sk->sk_lock.wq)))
1340                 wake_up(&(sk->sk_lock.wq));
1341         spin_unlock_bh(&(sk->sk_lock.slock));
1342 }
1343 EXPORT_SYMBOL(release_sock);
1344
1345 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1346
1347         if (!sock_flag(sk, SOCK_TIMESTAMP))
1348                 sock_enable_timestamp(sk);
1349         if (sk->sk_stamp.tv_sec == -1) 
1350                 return -ENOENT;
1351         if (sk->sk_stamp.tv_sec == 0)
1352                 do_gettimeofday(&sk->sk_stamp);
1353         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1354                 -EFAULT : 0; 
1355
1356 EXPORT_SYMBOL(sock_get_timestamp);
1357
1358 void sock_enable_timestamp(struct sock *sk)
1359 {       
1360         if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
1361                 sock_set_flag(sk, SOCK_TIMESTAMP);
1362                 net_enable_timestamp();
1363         }
1364 }
1365 EXPORT_SYMBOL(sock_enable_timestamp); 
1366
1367 /*
1368  *      Get a socket option on an socket.
1369  *
1370  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1371  *      asynchronous errors should be reported by getsockopt. We assume
1372  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1373  */
1374 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1375                            char __user *optval, int __user *optlen)
1376 {
1377         struct sock *sk = sock->sk;
1378
1379         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1380 }
1381
1382 EXPORT_SYMBOL(sock_common_getsockopt);
1383
1384 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1385                         struct msghdr *msg, size_t size, int flags)
1386 {
1387         struct sock *sk = sock->sk;
1388         int addr_len = 0;
1389         int err;
1390
1391         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1392                                    flags & ~MSG_DONTWAIT, &addr_len);
1393         if (err >= 0)
1394                 msg->msg_namelen = addr_len;
1395         return err;
1396 }
1397
1398 EXPORT_SYMBOL(sock_common_recvmsg);
1399
1400 /*
1401  *      Set socket options on an inet socket.
1402  */
1403 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1404                            char __user *optval, int optlen)
1405 {
1406         struct sock *sk = sock->sk;
1407
1408         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1409 }
1410
1411 EXPORT_SYMBOL(sock_common_setsockopt);
1412
1413 void sk_common_release(struct sock *sk)
1414 {
1415         if (sk->sk_prot->destroy)
1416                 sk->sk_prot->destroy(sk);
1417
1418         /*
1419          * Observation: when sock_common_release is called, processes have
1420          * no access to socket. But net still has.
1421          * Step one, detach it from networking:
1422          *
1423          * A. Remove from hash tables.
1424          */
1425
1426         sk->sk_prot->unhash(sk);
1427
1428         /*
1429          * In this point socket cannot receive new packets, but it is possible
1430          * that some packets are in flight because some CPU runs receiver and
1431          * did hash table lookup before we unhashed socket. They will achieve
1432          * receive queue and will be purged by socket destructor.
1433          *
1434          * Also we still have packets pending on receive queue and probably,
1435          * our own packets waiting in device queues. sock_destroy will drain
1436          * receive queue, but transmitted packets will delay socket destruction
1437          * until the last reference will be released.
1438          */
1439
1440         sock_orphan(sk);
1441
1442         xfrm_sk_free_policy(sk);
1443
1444         sk_refcnt_debug_release(sk);
1445         sock_put(sk);
1446 }
1447
1448 EXPORT_SYMBOL(sk_common_release);
1449
1450 static DEFINE_RWLOCK(proto_list_lock);
1451 static LIST_HEAD(proto_list);
1452
1453 int proto_register(struct proto *prot, int alloc_slab)
1454 {
1455         char *request_sock_slab_name = NULL;
1456         char *timewait_sock_slab_name;
1457         int rc = -ENOBUFS;
1458
1459         if (alloc_slab) {
1460                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1461                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1462
1463                 if (prot->slab == NULL) {
1464                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1465                                prot->name);
1466                         goto out;
1467                 }
1468
1469                 if (prot->rsk_prot != NULL) {
1470                         static const char mask[] = "request_sock_%s";
1471
1472                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1473                         if (request_sock_slab_name == NULL)
1474                                 goto out_free_sock_slab;
1475
1476                         sprintf(request_sock_slab_name, mask, prot->name);
1477                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1478                                                                  prot->rsk_prot->obj_size, 0,
1479                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1480
1481                         if (prot->rsk_prot->slab == NULL) {
1482                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1483                                        prot->name);
1484                                 goto out_free_request_sock_slab_name;
1485                         }
1486                 }
1487
1488                 if (prot->twsk_obj_size) {
1489                         static const char mask[] = "tw_sock_%s";
1490
1491                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1492
1493                         if (timewait_sock_slab_name == NULL)
1494                                 goto out_free_request_sock_slab;
1495
1496                         sprintf(timewait_sock_slab_name, mask, prot->name);
1497                         prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
1498                                                             prot->twsk_obj_size,
1499                                                             0, SLAB_HWCACHE_ALIGN,
1500                                                             NULL, NULL);
1501                         if (prot->twsk_slab == NULL)
1502                                 goto out_free_timewait_sock_slab_name;
1503                 }
1504         }
1505
1506         write_lock(&proto_list_lock);
1507         list_add(&prot->node, &proto_list);
1508         write_unlock(&proto_list_lock);
1509         rc = 0;
1510 out:
1511         return rc;
1512 out_free_timewait_sock_slab_name:
1513         kfree(timewait_sock_slab_name);
1514 out_free_request_sock_slab:
1515         if (prot->rsk_prot && prot->rsk_prot->slab) {
1516                 kmem_cache_destroy(prot->rsk_prot->slab);
1517                 prot->rsk_prot->slab = NULL;
1518         }
1519 out_free_request_sock_slab_name:
1520         kfree(request_sock_slab_name);
1521 out_free_sock_slab:
1522         kmem_cache_destroy(prot->slab);
1523         prot->slab = NULL;
1524         goto out;
1525 }
1526
1527 EXPORT_SYMBOL(proto_register);
1528
1529 void proto_unregister(struct proto *prot)
1530 {
1531         write_lock(&proto_list_lock);
1532
1533         if (prot->slab != NULL) {
1534                 kmem_cache_destroy(prot->slab);
1535                 prot->slab = NULL;
1536         }
1537
1538         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1539                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1540
1541                 kmem_cache_destroy(prot->rsk_prot->slab);
1542                 kfree(name);
1543                 prot->rsk_prot->slab = NULL;
1544         }
1545
1546         if (prot->twsk_slab != NULL) {
1547                 const char *name = kmem_cache_name(prot->twsk_slab);
1548
1549                 kmem_cache_destroy(prot->twsk_slab);
1550                 kfree(name);
1551                 prot->twsk_slab = NULL;
1552         }
1553
1554         list_del(&prot->node);
1555         write_unlock(&proto_list_lock);
1556 }
1557
1558 EXPORT_SYMBOL(proto_unregister);
1559
1560 #ifdef CONFIG_PROC_FS
1561 static inline struct proto *__proto_head(void)
1562 {
1563         return list_entry(proto_list.next, struct proto, node);
1564 }
1565
1566 static inline struct proto *proto_head(void)
1567 {
1568         return list_empty(&proto_list) ? NULL : __proto_head();
1569 }
1570
1571 static inline struct proto *proto_next(struct proto *proto)
1572 {
1573         return proto->node.next == &proto_list ? NULL :
1574                 list_entry(proto->node.next, struct proto, node);
1575 }
1576
1577 static inline struct proto *proto_get_idx(loff_t pos)
1578 {
1579         struct proto *proto;
1580         loff_t i = 0;
1581
1582         list_for_each_entry(proto, &proto_list, node)
1583                 if (i++ == pos)
1584                         goto out;
1585
1586         proto = NULL;
1587 out:
1588         return proto;
1589 }
1590
1591 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1592 {
1593         read_lock(&proto_list_lock);
1594         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1595 }
1596
1597 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1598 {
1599         ++*pos;
1600         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1601 }
1602
1603 static void proto_seq_stop(struct seq_file *seq, void *v)
1604 {
1605         read_unlock(&proto_list_lock);
1606 }
1607
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
1612
1613 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1614 {
1615         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1616                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1617                    proto->name,
1618                    proto->obj_size,
1619                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1620                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1621                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1622                    proto->max_header,
1623                    proto->slab == NULL ? "no" : "yes",
1624                    module_name(proto->owner),
1625                    proto_method_implemented(proto->close),
1626                    proto_method_implemented(proto->connect),
1627                    proto_method_implemented(proto->disconnect),
1628                    proto_method_implemented(proto->accept),
1629                    proto_method_implemented(proto->ioctl),
1630                    proto_method_implemented(proto->init),
1631                    proto_method_implemented(proto->destroy),
1632                    proto_method_implemented(proto->shutdown),
1633                    proto_method_implemented(proto->setsockopt),
1634                    proto_method_implemented(proto->getsockopt),
1635                    proto_method_implemented(proto->sendmsg),
1636                    proto_method_implemented(proto->recvmsg),
1637                    proto_method_implemented(proto->sendpage),
1638                    proto_method_implemented(proto->bind),
1639                    proto_method_implemented(proto->backlog_rcv),
1640                    proto_method_implemented(proto->hash),
1641                    proto_method_implemented(proto->unhash),
1642                    proto_method_implemented(proto->get_port),
1643                    proto_method_implemented(proto->enter_memory_pressure));
1644 }
1645
1646 static int proto_seq_show(struct seq_file *seq, void *v)
1647 {
1648         if (v == SEQ_START_TOKEN)
1649                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1650                            "protocol",
1651                            "size",
1652                            "sockets",
1653                            "memory",
1654                            "press",
1655                            "maxhdr",
1656                            "slab",
1657                            "module",
1658                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1659         else
1660                 proto_seq_printf(seq, v);
1661         return 0;
1662 }
1663
1664 static struct seq_operations proto_seq_ops = {
1665         .start  = proto_seq_start,
1666         .next   = proto_seq_next,
1667         .stop   = proto_seq_stop,
1668         .show   = proto_seq_show,
1669 };
1670
1671 static int proto_seq_open(struct inode *inode, struct file *file)
1672 {
1673         return seq_open(file, &proto_seq_ops);
1674 }
1675
1676 static struct file_operations proto_seq_fops = {
1677         .owner          = THIS_MODULE,
1678         .open           = proto_seq_open,
1679         .read           = seq_read,
1680         .llseek         = seq_lseek,
1681         .release        = seq_release,
1682 };
1683
1684 static int __init proto_init(void)
1685 {
1686         /* register /proc/net/protocols */
1687         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1688 }
1689
1690 subsys_initcall(proto_init);
1691
1692 #endif /* PROC_FS */
1693
/* Symbols from this file made available to modules. */

/* struct sock allocation, release and urgent-data signalling */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);

/* generic socket helpers */
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);

/* sock_no_* handlers */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

/* remaining sock_* helpers */
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);

/* sysctl variables; the rmem/wmem maxima are exported only with CONFIG_SYSCTL */
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif