Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:     Ross Biro
13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *              Alan Cox        :       Numerous verify_area() problems
19  *              Alan Cox        :       Connecting on a connecting socket
20  *                                      now returns an error for tcp.
21  *              Alan Cox        :       sock->protocol is set correctly.
22  *                                      and is not sometimes left as 0.
23  *              Alan Cox        :       connect handles icmp errors on a
24  *                                      connect properly. Unfortunately there
25  *                                      is a restart syscall nasty there. I
26  *                                      can't match BSD without hacking the C
27  *                                      library. Ideas urgently sought!
28  *              Alan Cox        :       Disallow bind() to addresses that are
29  *                                      not ours - especially broadcast ones!!
30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
32  *                                      instead they leave that for the DESTROY timer.
33  *              Alan Cox        :       Clean up error flag in accept
34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
35  *                                      was buggy. Put a remove_sock() in the handler
36  *                                      for memory when we hit 0. Also altered the timer
37  *                                      code. The ACK stuff can wait and needs major 
38  *                                      TCP layer surgery.
39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
40  *                                      and fixed timer/inet_bh race.
41  *              Alan Cox        :       Added zapped flag for TCP
42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
49  *      Pauline Middelink       :       identd support
50  *              Alan Cox        :       Fixed connect() taking signals I think.
51  *              Alan Cox        :       SO_LINGER supported
52  *              Alan Cox        :       Error reporting fixes
53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
54  *              Alan Cox        :       inet sockets don't set sk->type!
55  *              Alan Cox        :       Split socket option code
56  *              Alan Cox        :       Callbacks
57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
58  *              Alex            :       Removed restriction on inet fioctl
59  *              Alan Cox        :       Splitting INET from NET core
60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
62  *              Alan Cox        :       Split IP from generic code
63  *              Alan Cox        :       New kfree_skbmem()
64  *              Alan Cox        :       Make SO_DEBUG superuser only.
65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
66  *                                      (compatibility fix)
67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
68  *              Alan Cox        :       Allocator for a socket is settable.
69  *              Alan Cox        :       SO_ERROR includes soft errors.
70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
71  *              Alan Cox        :       Generic socket allocation to make hooks
72  *                                      easier (suggested by Craig Metz).
73  *              Michael Pall    :       SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
81  *              Andi Kleen      :       Fix write_space callback
82  *              Chris Evans     :       Security fixes - signedness again
83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *              This program is free software; you can redistribute it and/or
89  *              modify it under the terms of the GNU General Public License
90  *              as published by the Free Software Foundation; either version
91  *              2 of the License, or (at your option) any later version.
92  */
93
94 #include <linux/capability.h>
95 #include <linux/config.h>
96 #include <linux/errno.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
118
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/request_sock.h>
123 #include <net/sock.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
133 /* Take into consideration the size of the struct sk_buff overhead in the
134  * determination of these values, since that is non-constant across
135  * platforms.  This makes socket queueing behavior and performance
136  * not depend upon such differences.
137  */
138 #define _SK_MEM_PACKETS         256
139 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
140 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
141 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
142
143 /* Run time adjustable parameters: send/receive buffer ceilings and defaults, all initialised to the compile-time limits above. */
144 __u32 sysctl_wmem_max = SK_WMEM_MAX;
145 __u32 sysctl_rmem_max = SK_RMEM_MAX;
146 __u32 sysctl_wmem_default = SK_WMEM_MAX;
147 __u32 sysctl_rmem_default = SK_RMEM_MAX;
148
149 /* Maximal space eaten by iovec or ancillary data plus some space */
150 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
151
152 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
153 {
154         struct timeval tv;
155
156         if (optlen < sizeof(tv))
157                 return -EINVAL;
158         if (copy_from_user(&tv, optval, sizeof(tv)))
159                 return -EFAULT;
160
161         *timeo_p = MAX_SCHEDULE_TIMEOUT;
162         if (tv.tv_sec == 0 && tv.tv_usec == 0)
163                 return 0;
164         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
165                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
166         return 0;
167 }
168
169 static void sock_warn_obsolete_bsdism(const char *name)
170 {
171         static int warned;
172         static char warncomm[TASK_COMM_LEN];
173         if (strcmp(warncomm, current->comm) && warned < 5) { 
174                 strcpy(warncomm,  current->comm); 
175                 printk(KERN_WARNING "process `%s' is using obsolete "
176                        "%s SO_BSDCOMPAT\n", warncomm, name);
177                 warned++;
178         }
179 }
180
181 static void sock_disable_timestamp(struct sock *sk)
182 {       
183         if (sock_flag(sk, SOCK_TIMESTAMP)) { 
184                 sock_reset_flag(sk, SOCK_TIMESTAMP);
185                 net_disable_timestamp();
186         }
187 }
188
189
190 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
191 {
192         int err = 0;
193         int skb_len;
194
195         /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
196            number of warnings when compiling with -W --ANK
197          */
198         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
199             (unsigned)sk->sk_rcvbuf) {
200                 err = -ENOMEM;
201                 goto out;
202         }
203
204         /* It would be deadlock, if sock_queue_rcv_skb is used
205            with socket lock! We assume that users of this
206            function are lock free.
207         */
208         err = sk_filter(sk, skb, 1);
209         if (err)
210                 goto out;
211
212         skb->dev = NULL;
213         skb_set_owner_r(skb, sk);
214
215         /* Cache the SKB length before we tack it onto the receive
216          * queue.  Once it is added it no longer belongs to us and
217          * may be freed by other threads of control pulling packets
218          * from the queue.
219          */
220         skb_len = skb->len;
221
222         skb_queue_tail(&sk->sk_receive_queue, skb);
223
224         if (!sock_flag(sk, SOCK_DEAD))
225                 sk->sk_data_ready(sk, skb_len);
226 out:
227         return err;
228 }
229 EXPORT_SYMBOL(sock_queue_rcv_skb);
230
231 int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
232 {
233         int rc = NET_RX_SUCCESS;
234
235         if (sk_filter(sk, skb, 0))
236                 goto discard_and_relse;
237
238         skb->dev = NULL;
239
240         bh_lock_sock(sk);
241         if (!sock_owned_by_user(sk))
242                 rc = sk->sk_backlog_rcv(sk, skb);
243         else
244                 sk_add_backlog(sk, skb);
245         bh_unlock_sock(sk);
246 out:
247         sock_put(sk);
248         return rc;
249 discard_and_relse:
250         kfree_skb(skb);
251         goto out;
252 }
253 EXPORT_SYMBOL(sk_receive_skb);
254
255 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
256 {
257         struct dst_entry *dst = sk->sk_dst_cache;
258
259         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
260                 sk->sk_dst_cache = NULL;
261                 dst_release(dst);
262                 return NULL;
263         }
264
265         return dst;
266 }
267 EXPORT_SYMBOL(__sk_dst_check);
268
269 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
270 {
271         struct dst_entry *dst = sk_dst_get(sk);
272
273         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
274                 sk_dst_reset(sk);
275                 dst_release(dst);
276                 return NULL;
277         }
278
279         return dst;
280 }
281 EXPORT_SYMBOL(sk_dst_check);
282
283 /*
284  *      This is meant for all protocols to use and covers goings on
285  *      at the socket level. Everything here is generic.
286  */
287
288 int sock_setsockopt(struct socket *sock, int level, int optname,
289                     char __user *optval, int optlen)
290 {
291         struct sock *sk=sock->sk;
292         struct sk_filter *filter;
293         int val;
294         int valbool;
295         struct linger ling;
296         int ret = 0;
297         
298         /*
299          *      Options without arguments
300          */
301
302 #ifdef SO_DONTLINGER            /* Compatibility item... */
303         if (optname == SO_DONTLINGER) {
304                 lock_sock(sk);
305                 sock_reset_flag(sk, SOCK_LINGER);
306                 release_sock(sk);
307                 return 0;
308         }
309 #endif
310         
311         if(optlen<sizeof(int))
312                 return(-EINVAL);
313         
314         if (get_user(val, (int __user *)optval))
315                 return -EFAULT;
316         
317         valbool = val?1:0;
318
319         lock_sock(sk);
320
321         switch(optname) 
322         {
323                 case SO_DEBUG:  
324                         if(val && !capable(CAP_NET_ADMIN))
325                         {
326                                 ret = -EACCES;
327                         }
328                         else if (valbool)
329                                 sock_set_flag(sk, SOCK_DBG);
330                         else
331                                 sock_reset_flag(sk, SOCK_DBG);
332                         break;
333                 case SO_REUSEADDR:
334                         sk->sk_reuse = valbool;
335                         break;
336                 case SO_TYPE:
337                 case SO_ERROR:
338                         ret = -ENOPROTOOPT;
339                         break;
340                 case SO_DONTROUTE:
341                         if (valbool)
342                                 sock_set_flag(sk, SOCK_LOCALROUTE);
343                         else
344                                 sock_reset_flag(sk, SOCK_LOCALROUTE);
345                         break;
346                 case SO_BROADCAST:
347                         sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
348                         break;
349                 case SO_SNDBUF:
350                         /* Don't error on this BSD doesn't and if you think
351                            about it this is right. Otherwise apps have to
352                            play 'guess the biggest size' games. RCVBUF/SNDBUF
353                            are treated in BSD as hints */
354                            
355                         if (val > sysctl_wmem_max)
356                                 val = sysctl_wmem_max;
357 set_sndbuf:
358                         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
359                         if ((val * 2) < SOCK_MIN_SNDBUF)
360                                 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
361                         else
362                                 sk->sk_sndbuf = val * 2;
363
364                         /*
365                          *      Wake up sending tasks if we
366                          *      upped the value.
367                          */
368                         sk->sk_write_space(sk);
369                         break;
370
371                 case SO_SNDBUFFORCE:
372                         if (!capable(CAP_NET_ADMIN)) {
373                                 ret = -EPERM;
374                                 break;
375                         }
376                         goto set_sndbuf;
377
378                 case SO_RCVBUF:
379                         /* Don't error on this BSD doesn't and if you think
380                            about it this is right. Otherwise apps have to
381                            play 'guess the biggest size' games. RCVBUF/SNDBUF
382                            are treated in BSD as hints */
383                           
384                         if (val > sysctl_rmem_max)
385                                 val = sysctl_rmem_max;
386 set_rcvbuf:
387                         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
388                         /* FIXME: is this lower bound the right one? */
389                         if ((val * 2) < SOCK_MIN_RCVBUF)
390                                 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
391                         else
392                                 sk->sk_rcvbuf = val * 2;
393                         break;
394
395                 case SO_RCVBUFFORCE:
396                         if (!capable(CAP_NET_ADMIN)) {
397                                 ret = -EPERM;
398                                 break;
399                         }
400                         goto set_rcvbuf;
401
402                 case SO_KEEPALIVE:
403 #ifdef CONFIG_INET
404                         if (sk->sk_protocol == IPPROTO_TCP)
405                                 tcp_set_keepalive(sk, valbool);
406 #endif
407                         sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
408                         break;
409
410                 case SO_OOBINLINE:
411                         sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
412                         break;
413
414                 case SO_NO_CHECK:
415                         sk->sk_no_check = valbool;
416                         break;
417
418                 case SO_PRIORITY:
419                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
420                                 sk->sk_priority = val;
421                         else
422                                 ret = -EPERM;
423                         break;
424
425                 case SO_LINGER:
426                         if(optlen<sizeof(ling)) {
427                                 ret = -EINVAL;  /* 1003.1g */
428                                 break;
429                         }
430                         if (copy_from_user(&ling,optval,sizeof(ling))) {
431                                 ret = -EFAULT;
432                                 break;
433                         }
434                         if (!ling.l_onoff)
435                                 sock_reset_flag(sk, SOCK_LINGER);
436                         else {
437 #if (BITS_PER_LONG == 32)
438                                 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
439                                         sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
440                                 else
441 #endif
442                                         sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
443                                 sock_set_flag(sk, SOCK_LINGER);
444                         }
445                         break;
446
447                 case SO_BSDCOMPAT:
448                         sock_warn_obsolete_bsdism("setsockopt");
449                         break;
450
451                 case SO_PASSCRED:
452                         if (valbool)
453                                 set_bit(SOCK_PASSCRED, &sock->flags);
454                         else
455                                 clear_bit(SOCK_PASSCRED, &sock->flags);
456                         break;
457
458                 case SO_TIMESTAMP:
459                         if (valbool)  {
460                                 sock_set_flag(sk, SOCK_RCVTSTAMP);
461                                 sock_enable_timestamp(sk);
462                         } else
463                                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
464                         break;
465
466                 case SO_RCVLOWAT:
467                         if (val < 0)
468                                 val = INT_MAX;
469                         sk->sk_rcvlowat = val ? : 1;
470                         break;
471
472                 case SO_RCVTIMEO:
473                         ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
474                         break;
475
476                 case SO_SNDTIMEO:
477                         ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
478                         break;
479
480 #ifdef CONFIG_NETDEVICES
481                 case SO_BINDTODEVICE:
482                 {
483                         char devname[IFNAMSIZ]; 
484
485                         /* Sorry... */ 
486                         if (!capable(CAP_NET_RAW)) {
487                                 ret = -EPERM;
488                                 break;
489                         }
490
491                         /* Bind this socket to a particular device like "eth0",
492                          * as specified in the passed interface name. If the
493                          * name is "" or the option length is zero the socket 
494                          * is not bound. 
495                          */ 
496
497                         if (!valbool) {
498                                 sk->sk_bound_dev_if = 0;
499                         } else {
500                                 if (optlen > IFNAMSIZ - 1)
501                                         optlen = IFNAMSIZ - 1;
502                                 memset(devname, 0, sizeof(devname));
503                                 if (copy_from_user(devname, optval, optlen)) {
504                                         ret = -EFAULT;
505                                         break;
506                                 }
507
508                                 /* Remove any cached route for this socket. */
509                                 sk_dst_reset(sk);
510
511                                 if (devname[0] == '\0') {
512                                         sk->sk_bound_dev_if = 0;
513                                 } else {
514                                         struct net_device *dev = dev_get_by_name(devname);
515                                         if (!dev) {
516                                                 ret = -ENODEV;
517                                                 break;
518                                         }
519                                         sk->sk_bound_dev_if = dev->ifindex;
520                                         dev_put(dev);
521                                 }
522                         }
523                         break;
524                 }
525 #endif
526
527
528                 case SO_ATTACH_FILTER:
529                         ret = -EINVAL;
530                         if (optlen == sizeof(struct sock_fprog)) {
531                                 struct sock_fprog fprog;
532
533                                 ret = -EFAULT;
534                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
535                                         break;
536
537                                 ret = sk_attach_filter(&fprog, sk);
538                         }
539                         break;
540
541                 case SO_DETACH_FILTER:
542                         spin_lock_bh(&sk->sk_lock.slock);
543                         filter = sk->sk_filter;
544                         if (filter) {
545                                 sk->sk_filter = NULL;
546                                 spin_unlock_bh(&sk->sk_lock.slock);
547                                 sk_filter_release(sk, filter);
548                                 break;
549                         }
550                         spin_unlock_bh(&sk->sk_lock.slock);
551                         ret = -ENONET;
552                         break;
553
554                 /* We implement the SO_SNDLOWAT etc to
555                    not be settable (1003.1g 5.3) */
556                 default:
557                         ret = -ENOPROTOOPT;
558                         break;
559         }
560         release_sock(sk);
561         return ret;
562 }
563
564
565 int sock_getsockopt(struct socket *sock, int level, int optname,
566                     char __user *optval, int __user *optlen)
567 {
568         struct sock *sk = sock->sk;
569         
570         union
571         {
572                 int val;
573                 struct linger ling;
574                 struct timeval tm;
575         } v;
576         
577         unsigned int lv = sizeof(int);
578         int len;
579         
580         if(get_user(len,optlen))
581                 return -EFAULT;
582         if(len < 0)
583                 return -EINVAL;
584                 
585         switch(optname) 
586         {
587                 case SO_DEBUG:          
588                         v.val = sock_flag(sk, SOCK_DBG);
589                         break;
590                 
591                 case SO_DONTROUTE:
592                         v.val = sock_flag(sk, SOCK_LOCALROUTE);
593                         break;
594                 
595                 case SO_BROADCAST:
596                         v.val = !!sock_flag(sk, SOCK_BROADCAST);
597                         break;
598
599                 case SO_SNDBUF:
600                         v.val = sk->sk_sndbuf;
601                         break;
602                 
603                 case SO_RCVBUF:
604                         v.val = sk->sk_rcvbuf;
605                         break;
606
607                 case SO_REUSEADDR:
608                         v.val = sk->sk_reuse;
609                         break;
610
611                 case SO_KEEPALIVE:
612                         v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
613                         break;
614
615                 case SO_TYPE:
616                         v.val = sk->sk_type;                            
617                         break;
618
619                 case SO_ERROR:
620                         v.val = -sock_error(sk);
621                         if(v.val==0)
622                                 v.val = xchg(&sk->sk_err_soft, 0);
623                         break;
624
625                 case SO_OOBINLINE:
626                         v.val = !!sock_flag(sk, SOCK_URGINLINE);
627                         break;
628         
629                 case SO_NO_CHECK:
630                         v.val = sk->sk_no_check;
631                         break;
632
633                 case SO_PRIORITY:
634                         v.val = sk->sk_priority;
635                         break;
636                 
637                 case SO_LINGER: 
638                         lv              = sizeof(v.ling);
639                         v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
640                         v.ling.l_linger = sk->sk_lingertime / HZ;
641                         break;
642                                         
643                 case SO_BSDCOMPAT:
644                         sock_warn_obsolete_bsdism("getsockopt");
645                         break;
646
647                 case SO_TIMESTAMP:
648                         v.val = sock_flag(sk, SOCK_RCVTSTAMP);
649                         break;
650
651                 case SO_RCVTIMEO:
652                         lv=sizeof(struct timeval);
653                         if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
654                                 v.tm.tv_sec = 0;
655                                 v.tm.tv_usec = 0;
656                         } else {
657                                 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
658                                 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
659                         }
660                         break;
661
662                 case SO_SNDTIMEO:
663                         lv=sizeof(struct timeval);
664                         if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
665                                 v.tm.tv_sec = 0;
666                                 v.tm.tv_usec = 0;
667                         } else {
668                                 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
669                                 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
670                         }
671                         break;
672
673                 case SO_RCVLOWAT:
674                         v.val = sk->sk_rcvlowat;
675                         break;
676
677                 case SO_SNDLOWAT:
678                         v.val=1;
679                         break; 
680
681                 case SO_PASSCRED:
682                         v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
683                         break;
684
685                 case SO_PEERCRED:
686                         if (len > sizeof(sk->sk_peercred))
687                                 len = sizeof(sk->sk_peercred);
688                         if (copy_to_user(optval, &sk->sk_peercred, len))
689                                 return -EFAULT;
690                         goto lenout;
691
692                 case SO_PEERNAME:
693                 {
694                         char address[128];
695
696                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
697                                 return -ENOTCONN;
698                         if (lv < len)
699                                 return -EINVAL;
700                         if (copy_to_user(optval, address, len))
701                                 return -EFAULT;
702                         goto lenout;
703                 }
704
705                 /* Dubious BSD thing... Probably nobody even uses it, but
706                  * the UNIX standard wants it for whatever reason... -DaveM
707                  */
708                 case SO_ACCEPTCONN:
709                         v.val = sk->sk_state == TCP_LISTEN;
710                         break;
711
712                 case SO_PEERSEC:
713                         return security_socket_getpeersec_stream(sock, optval, optlen, len);
714
715                 default:
716                         return(-ENOPROTOOPT);
717         }
718         if (len > lv)
719                 len = lv;
720         if (copy_to_user(optval, &v, len))
721                 return -EFAULT;
722 lenout:
723         if (put_user(len, optlen))
724                 return -EFAULT;
725         return 0;
726 }
727
728 /**
729  *      sk_alloc - All socket objects are allocated here
730  *      @family: protocol family
731  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
732  *      @prot: struct proto associated with this new sock instance
733  *      @zero_it: if we should zero the newly allocated sock
734  */
735 struct sock *sk_alloc(int family, gfp_t priority,
736                       struct proto *prot, int zero_it)
737 {
738         struct sock *sk = NULL;
739         kmem_cache_t *slab = prot->slab;
740
741         if (slab != NULL)
742                 sk = kmem_cache_alloc(slab, priority);
743         else
744                 sk = kmalloc(prot->obj_size, priority);
745
746         if (sk) {
747                 if (zero_it) {
748                         memset(sk, 0, prot->obj_size);
749                         sk->sk_family = family;
750                         /*
751                          * See comment in struct sock definition to understand
752                          * why we need sk_prot_creator -acme
753                          */
754                         sk->sk_prot = sk->sk_prot_creator = prot;
755                         sock_lock_init(sk);
756                 }
757                 
758                 if (security_sk_alloc(sk, family, priority))
759                         goto out_free;
760
761                 if (!try_module_get(prot->owner))
762                         goto out_free;
763         }
764         return sk;
765
766 out_free:
767         if (slab != NULL)
768                 kmem_cache_free(slab, sk);
769         else
770                 kfree(sk);
771         return NULL;
772 }
773
774 void sk_free(struct sock *sk)
775 {
776         struct sk_filter *filter;
777         struct module *owner = sk->sk_prot_creator->owner;
778
779         if (sk->sk_destruct)
780                 sk->sk_destruct(sk);
781
782         filter = sk->sk_filter;
783         if (filter) {
784                 sk_filter_release(sk, filter);
785                 sk->sk_filter = NULL;
786         }
787
788         sock_disable_timestamp(sk);
789
790         if (atomic_read(&sk->sk_omem_alloc))
791                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
792                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
793
794         security_sk_free(sk);
795         if (sk->sk_prot_creator->slab != NULL)
796                 kmem_cache_free(sk->sk_prot_creator->slab, sk);
797         else
798                 kfree(sk);
799         module_put(owner);
800 }
801
802 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
803 {
804         struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
805
806         if (newsk != NULL) {
807                 struct sk_filter *filter;
808
809                 memcpy(newsk, sk, sk->sk_prot->obj_size);
810
811                 /* SANITY */
812                 sk_node_init(&newsk->sk_node);
813                 sock_lock_init(newsk);
814                 bh_lock_sock(newsk);
815
816                 atomic_set(&newsk->sk_rmem_alloc, 0);
817                 atomic_set(&newsk->sk_wmem_alloc, 0);
818                 atomic_set(&newsk->sk_omem_alloc, 0);
819                 skb_queue_head_init(&newsk->sk_receive_queue);
820                 skb_queue_head_init(&newsk->sk_write_queue);
821
822                 rwlock_init(&newsk->sk_dst_lock);
823                 rwlock_init(&newsk->sk_callback_lock);
824
825                 newsk->sk_dst_cache     = NULL;
826                 newsk->sk_wmem_queued   = 0;
827                 newsk->sk_forward_alloc = 0;
828                 newsk->sk_send_head     = NULL;
829                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
830                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
831
832                 sock_reset_flag(newsk, SOCK_DONE);
833                 skb_queue_head_init(&newsk->sk_error_queue);
834
835                 filter = newsk->sk_filter;
836                 if (filter != NULL)
837                         sk_filter_charge(newsk, filter);
838
839                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
840                         /* It is still raw copy of parent, so invalidate
841                          * destructor and make plain sk_free() */
842                         newsk->sk_destruct = NULL;
843                         sk_free(newsk);
844                         newsk = NULL;
845                         goto out;
846                 }
847
848                 newsk->sk_err      = 0;
849                 newsk->sk_priority = 0;
850                 atomic_set(&newsk->sk_refcnt, 2);
851
852                 /*
853                  * Increment the counter in the same struct proto as the master
854                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
855                  * is the same as sk->sk_prot->socks, as this field was copied
856                  * with memcpy).
857                  *
858                  * This _changes_ the previous behaviour, where
859                  * tcp_create_openreq_child always was incrementing the
860                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
861                  * to be taken into account in all callers. -acme
862                  */
863                 sk_refcnt_debug_inc(newsk);
864                 newsk->sk_socket = NULL;
865                 newsk->sk_sleep  = NULL;
866
867                 if (newsk->sk_prot->sockets_allocated)
868                         atomic_inc(newsk->sk_prot->sockets_allocated);
869         }
870 out:
871         return newsk;
872 }
873
874 EXPORT_SYMBOL_GPL(sk_clone);
875
876 void __init sk_init(void)
877 {
878         if (num_physpages <= 4096) {
879                 sysctl_wmem_max = 32767;
880                 sysctl_rmem_max = 32767;
881                 sysctl_wmem_default = 32767;
882                 sysctl_rmem_default = 32767;
883         } else if (num_physpages >= 131072) {
884                 sysctl_wmem_max = 131071;
885                 sysctl_rmem_max = 131071;
886         }
887 }
888
889 /*
890  *      Simple resource managers for sockets.
891  */
892
893
894 /* 
895  * Write buffer destructor automatically called from kfree_skb. 
896  */
897 void sock_wfree(struct sk_buff *skb)
898 {
899         struct sock *sk = skb->sk;
900
901         /* In case it might be waiting for more memory. */
902         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
903         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
904                 sk->sk_write_space(sk);
905         sock_put(sk);
906 }
907
908 /* 
909  * Read buffer destructor automatically called from kfree_skb. 
910  */
911 void sock_rfree(struct sk_buff *skb)
912 {
913         struct sock *sk = skb->sk;
914
915         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
916 }
917
918
919 int sock_i_uid(struct sock *sk)
920 {
921         int uid;
922
923         read_lock(&sk->sk_callback_lock);
924         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
925         read_unlock(&sk->sk_callback_lock);
926         return uid;
927 }
928
929 unsigned long sock_i_ino(struct sock *sk)
930 {
931         unsigned long ino;
932
933         read_lock(&sk->sk_callback_lock);
934         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
935         read_unlock(&sk->sk_callback_lock);
936         return ino;
937 }
938
939 /*
940  * Allocate a skb from the socket's send buffer.
941  */
942 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
943                              gfp_t priority)
944 {
945         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
946                 struct sk_buff * skb = alloc_skb(size, priority);
947                 if (skb) {
948                         skb_set_owner_w(skb, sk);
949                         return skb;
950                 }
951         }
952         return NULL;
953 }
954
955 /*
956  * Allocate a skb from the socket's receive buffer.
957  */ 
958 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
959                              gfp_t priority)
960 {
961         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
962                 struct sk_buff *skb = alloc_skb(size, priority);
963                 if (skb) {
964                         skb_set_owner_r(skb, sk);
965                         return skb;
966                 }
967         }
968         return NULL;
969 }
970
971 /* 
972  * Allocate a memory block from the socket's option memory buffer.
973  */ 
974 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
975 {
976         if ((unsigned)size <= sysctl_optmem_max &&
977             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
978                 void *mem;
979                 /* First do the add, to avoid the race if kmalloc
980                  * might sleep.
981                  */
982                 atomic_add(size, &sk->sk_omem_alloc);
983                 mem = kmalloc(size, priority);
984                 if (mem)
985                         return mem;
986                 atomic_sub(size, &sk->sk_omem_alloc);
987         }
988         return NULL;
989 }
990
991 /*
992  * Free an option memory block.
993  */
994 void sock_kfree_s(struct sock *sk, void *mem, int size)
995 {
996         kfree(mem);
997         atomic_sub(size, &sk->sk_omem_alloc);
998 }
999
1000 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1001    I think, these locks should be removed for datagram sockets.
1002  */
1003 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1004 {
1005         DEFINE_WAIT(wait);
1006
1007         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1008         for (;;) {
1009                 if (!timeo)
1010                         break;
1011                 if (signal_pending(current))
1012                         break;
1013                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1014                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1015                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1016                         break;
1017                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1018                         break;
1019                 if (sk->sk_err)
1020                         break;
1021                 timeo = schedule_timeout(timeo);
1022         }
1023         finish_wait(sk->sk_sleep, &wait);
1024         return timeo;
1025 }
1026
1027
1028 /*
1029  *      Generic send/receive buffer handlers
1030  */
1031
1032 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1033                                             unsigned long header_len,
1034                                             unsigned long data_len,
1035                                             int noblock, int *errcode)
1036 {
1037         struct sk_buff *skb;
1038         gfp_t gfp_mask;
1039         long timeo;
1040         int err;
1041
1042         gfp_mask = sk->sk_allocation;
1043         if (gfp_mask & __GFP_WAIT)
1044                 gfp_mask |= __GFP_REPEAT;
1045
1046         timeo = sock_sndtimeo(sk, noblock);
1047         while (1) {
1048                 err = sock_error(sk);
1049                 if (err != 0)
1050                         goto failure;
1051
1052                 err = -EPIPE;
1053                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1054                         goto failure;
1055
1056                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1057                         skb = alloc_skb(header_len, sk->sk_allocation);
1058                         if (skb) {
1059                                 int npages;
1060                                 int i;
1061
1062                                 /* No pages, we're done... */
1063                                 if (!data_len)
1064                                         break;
1065
1066                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1067                                 skb->truesize += data_len;
1068                                 skb_shinfo(skb)->nr_frags = npages;
1069                                 for (i = 0; i < npages; i++) {
1070                                         struct page *page;
1071                                         skb_frag_t *frag;
1072
1073                                         page = alloc_pages(sk->sk_allocation, 0);
1074                                         if (!page) {
1075                                                 err = -ENOBUFS;
1076                                                 skb_shinfo(skb)->nr_frags = i;
1077                                                 kfree_skb(skb);
1078                                                 goto failure;
1079                                         }
1080
1081                                         frag = &skb_shinfo(skb)->frags[i];
1082                                         frag->page = page;
1083                                         frag->page_offset = 0;
1084                                         frag->size = (data_len >= PAGE_SIZE ?
1085                                                       PAGE_SIZE :
1086                                                       data_len);
1087                                         data_len -= PAGE_SIZE;
1088                                 }
1089
1090                                 /* Full success... */
1091                                 break;
1092                         }
1093                         err = -ENOBUFS;
1094                         goto failure;
1095                 }
1096                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1097                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1098                 err = -EAGAIN;
1099                 if (!timeo)
1100                         goto failure;
1101                 if (signal_pending(current))
1102                         goto interrupted;
1103                 timeo = sock_wait_for_wmem(sk, timeo);
1104         }
1105
1106         skb_set_owner_w(skb, sk);
1107         return skb;
1108
1109 interrupted:
1110         err = sock_intr_errno(timeo);
1111 failure:
1112         *errcode = err;
1113         return NULL;
1114 }
1115
/*
 * Allocate a send skb of @size bytes with no page fragments; a thin
 * wrapper around sock_alloc_send_pskb() with a data length of zero.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1121
1122 static void __lock_sock(struct sock *sk)
1123 {
1124         DEFINE_WAIT(wait);
1125
1126         for(;;) {
1127                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1128                                         TASK_UNINTERRUPTIBLE);
1129                 spin_unlock_bh(&sk->sk_lock.slock);
1130                 schedule();
1131                 spin_lock_bh(&sk->sk_lock.slock);
1132                 if(!sock_owned_by_user(sk))
1133                         break;
1134         }
1135         finish_wait(&sk->sk_lock.wq, &wait);
1136 }
1137
1138 static void __release_sock(struct sock *sk)
1139 {
1140         struct sk_buff *skb = sk->sk_backlog.head;
1141
1142         do {
1143                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1144                 bh_unlock_sock(sk);
1145
1146                 do {
1147                         struct sk_buff *next = skb->next;
1148
1149                         skb->next = NULL;
1150                         sk->sk_backlog_rcv(sk, skb);
1151
1152                         /*
1153                          * We are in process context here with softirqs
1154                          * disabled, use cond_resched_softirq() to preempt.
1155                          * This is safe to do because we've taken the backlog
1156                          * queue private:
1157                          */
1158                         cond_resched_softirq();
1159
1160                         skb = next;
1161                 } while (skb != NULL);
1162
1163                 bh_lock_sock(sk);
1164         } while((skb = sk->sk_backlog.head) != NULL);
1165 }
1166
1167 /**
1168  * sk_wait_data - wait for data to arrive at sk_receive_queue
1169  * @sk:    sock to wait on
1170  * @timeo: for how long
1171  *
1172  * Now socket state including sk->sk_err is changed only under lock,
1173  * hence we may omit checks after joining wait queue.
1174  * We check receive queue before schedule() only as optimization;
1175  * it is very likely that release_sock() added new data.
1176  */
1177 int sk_wait_data(struct sock *sk, long *timeo)
1178 {
1179         int rc;
1180         DEFINE_WAIT(wait);
1181
1182         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1183         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1184         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1185         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1186         finish_wait(sk->sk_sleep, &wait);
1187         return rc;
1188 }
1189
1190 EXPORT_SYMBOL(sk_wait_data);
1191
1192 /*
1193  * Set of default routines for initialising struct proto_ops when
1194  * the protocol does not support a particular function. In certain
1195  * cases where it makes no sense for a protocol to have a "do nothing"
1196  * function, some default processing is provided.
1197  */
1198
/* Default bind handler for protocols without bind support: always fails. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1203
/* Default connect handler for protocols without connect support: always fails. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1209
/* Default socketpair handler for protocols without socketpair support: always fails. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1214
/* Default accept handler for protocols without accept support: always fails. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1219
/* Default getname handler for protocols without getname support: always fails. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1225
1226 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1227 {
1228         return 0;
1229 }
1230
/* Default ioctl handler for protocols without ioctl support: always fails. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1235
/* Default listen handler for protocols without listen support: always fails. */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1240
/* Default shutdown handler for protocols without shutdown support: always fails. */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1245
1246 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1247                     char __user *optval, int optlen)
1248 {
1249         return -EOPNOTSUPP;
1250 }
1251
1252 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1253                     char __user *optval, int __user *optlen)
1254 {
1255         return -EOPNOTSUPP;
1256 }
1257
/* Default sendmsg handler for protocols without sendmsg support: always fails. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1263
/* Default recvmsg handler for protocols without recvmsg support: always fails. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	const int unsupported = -EOPNOTSUPP;

	return unsupported;
}
1269
/* Default mmap handler for protocols without mmap support: always fails. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	const int no_mmap = -ENODEV;

	return no_mmap;
}
1275
1276 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1277 {
1278         ssize_t res;
1279         struct msghdr msg = {.msg_flags = flags};
1280         struct kvec iov;
1281         char *kaddr = kmap(page);
1282         iov.iov_base = kaddr + offset;
1283         iov.iov_len = size;
1284         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1285         kunmap(page);
1286         return res;
1287 }
1288
1289 /*
1290  *      Default Socket Callbacks
1291  */
1292
1293 static void sock_def_wakeup(struct sock *sk)
1294 {
1295         read_lock(&sk->sk_callback_lock);
1296         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1297                 wake_up_interruptible_all(sk->sk_sleep);
1298         read_unlock(&sk->sk_callback_lock);
1299 }
1300
1301 static void sock_def_error_report(struct sock *sk)
1302 {
1303         read_lock(&sk->sk_callback_lock);
1304         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1305                 wake_up_interruptible(sk->sk_sleep);
1306         sk_wake_async(sk,0,POLL_ERR); 
1307         read_unlock(&sk->sk_callback_lock);
1308 }
1309
1310 static void sock_def_readable(struct sock *sk, int len)
1311 {
1312         read_lock(&sk->sk_callback_lock);
1313         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1314                 wake_up_interruptible(sk->sk_sleep);
1315         sk_wake_async(sk,1,POLL_IN);
1316         read_unlock(&sk->sk_callback_lock);
1317 }
1318
1319 static void sock_def_write_space(struct sock *sk)
1320 {
1321         read_lock(&sk->sk_callback_lock);
1322
1323         /* Do not wake up a writer until he can make "significant"
1324          * progress.  --DaveM
1325          */
1326         if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1327                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1328                         wake_up_interruptible(sk->sk_sleep);
1329
1330                 /* Should agree with poll, otherwise some programs break */
1331                 if (sock_writeable(sk))
1332                         sk_wake_async(sk, 2, POLL_OUT);
1333         }
1334
1335         read_unlock(&sk->sk_callback_lock);
1336 }
1337
1338 static void sock_def_destruct(struct sock *sk)
1339 {
1340         kfree(sk->sk_protinfo);
1341 }
1342
1343 void sk_send_sigurg(struct sock *sk)
1344 {
1345         if (sk->sk_socket && sk->sk_socket->file)
1346                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1347                         sk_wake_async(sk, 3, POLL_PRI);
1348 }
1349
/*
 * (Re)arm @timer to fire at @expires.  A sock reference is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}
1356
1357 EXPORT_SYMBOL(sk_reset_timer);
1358
/*
 * Cancel @timer.  If it was pending and del_timer() removed it, drop the
 * sock reference that went with it.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
1364
1365 EXPORT_SYMBOL(sk_stop_timer);
1366
1367 void sock_init_data(struct socket *sock, struct sock *sk)
1368 {
1369         skb_queue_head_init(&sk->sk_receive_queue);
1370         skb_queue_head_init(&sk->sk_write_queue);
1371         skb_queue_head_init(&sk->sk_error_queue);
1372
1373         sk->sk_send_head        =       NULL;
1374
1375         init_timer(&sk->sk_timer);
1376         
1377         sk->sk_allocation       =       GFP_KERNEL;
1378         sk->sk_rcvbuf           =       sysctl_rmem_default;
1379         sk->sk_sndbuf           =       sysctl_wmem_default;
1380         sk->sk_state            =       TCP_CLOSE;
1381         sk->sk_socket           =       sock;
1382
1383         sock_set_flag(sk, SOCK_ZAPPED);
1384
1385         if(sock)
1386         {
1387                 sk->sk_type     =       sock->type;
1388                 sk->sk_sleep    =       &sock->wait;
1389                 sock->sk        =       sk;
1390         } else
1391                 sk->sk_sleep    =       NULL;
1392
1393         rwlock_init(&sk->sk_dst_lock);
1394         rwlock_init(&sk->sk_callback_lock);
1395
1396         sk->sk_state_change     =       sock_def_wakeup;
1397         sk->sk_data_ready       =       sock_def_readable;
1398         sk->sk_write_space      =       sock_def_write_space;
1399         sk->sk_error_report     =       sock_def_error_report;
1400         sk->sk_destruct         =       sock_def_destruct;
1401
1402         sk->sk_sndmsg_page      =       NULL;
1403         sk->sk_sndmsg_off       =       0;
1404
1405         sk->sk_peercred.pid     =       0;
1406         sk->sk_peercred.uid     =       -1;
1407         sk->sk_peercred.gid     =       -1;
1408         sk->sk_write_pending    =       0;
1409         sk->sk_rcvlowat         =       1;
1410         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1411         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1412
1413         sk->sk_stamp.tv_sec     = -1L;
1414         sk->sk_stamp.tv_usec    = -1L;
1415
1416         atomic_set(&sk->sk_refcnt, 1);
1417 }
1418
1419 void fastcall lock_sock(struct sock *sk)
1420 {
1421         might_sleep();
1422         spin_lock_bh(&(sk->sk_lock.slock));
1423         if (sk->sk_lock.owner)
1424                 __lock_sock(sk);
1425         sk->sk_lock.owner = (void *)1;
1426         spin_unlock_bh(&(sk->sk_lock.slock));
1427 }
1428
1429 EXPORT_SYMBOL(lock_sock);
1430
1431 void fastcall release_sock(struct sock *sk)
1432 {
1433         spin_lock_bh(&(sk->sk_lock.slock));
1434         if (sk->sk_backlog.tail)
1435                 __release_sock(sk);
1436         sk->sk_lock.owner = NULL;
1437         if (waitqueue_active(&(sk->sk_lock.wq)))
1438                 wake_up(&(sk->sk_lock.wq));
1439         spin_unlock_bh(&(sk->sk_lock.slock));
1440 }
1441 EXPORT_SYMBOL(release_sock);
1442
1443 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1444
1445         if (!sock_flag(sk, SOCK_TIMESTAMP))
1446                 sock_enable_timestamp(sk);
1447         if (sk->sk_stamp.tv_sec == -1) 
1448                 return -ENOENT;
1449         if (sk->sk_stamp.tv_sec == 0)
1450                 do_gettimeofday(&sk->sk_stamp);
1451         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1452                 -EFAULT : 0; 
1453
1454 EXPORT_SYMBOL(sock_get_timestamp);
1455
1456 void sock_enable_timestamp(struct sock *sk)
1457 {       
1458         if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
1459                 sock_set_flag(sk, SOCK_TIMESTAMP);
1460                 net_enable_timestamp();
1461         }
1462 }
1463 EXPORT_SYMBOL(sock_enable_timestamp); 
1464
1465 /*
1466  *      Get a socket option on an socket.
1467  *
1468  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1469  *      asynchronous errors should be reported by getsockopt. We assume
1470  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1471  */
1472 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1473                            char __user *optval, int __user *optlen)
1474 {
1475         struct sock *sk = sock->sk;
1476
1477         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1478 }
1479
1480 EXPORT_SYMBOL(sock_common_getsockopt);
1481
1482 #ifdef CONFIG_COMPAT
1483 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1484                                   char __user *optval, int __user *optlen)
1485 {
1486         struct sock *sk = sock->sk;
1487
1488         if (sk->sk_prot->compat_setsockopt != NULL)
1489                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1490                                                       optval, optlen);
1491         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1492 }
1493 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1494 #endif
1495
1496 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1497                         struct msghdr *msg, size_t size, int flags)
1498 {
1499         struct sock *sk = sock->sk;
1500         int addr_len = 0;
1501         int err;
1502
1503         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1504                                    flags & ~MSG_DONTWAIT, &addr_len);
1505         if (err >= 0)
1506                 msg->msg_namelen = addr_len;
1507         return err;
1508 }
1509
1510 EXPORT_SYMBOL(sock_common_recvmsg);
1511
1512 /*
1513  *      Set socket options on an inet socket.
1514  */
1515 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1516                            char __user *optval, int optlen)
1517 {
1518         struct sock *sk = sock->sk;
1519
1520         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1521 }
1522
1523 EXPORT_SYMBOL(sock_common_setsockopt);
1524
1525 #ifdef CONFIG_COMPAT
1526 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1527                                   char __user *optval, int optlen)
1528 {
1529         struct sock *sk = sock->sk;
1530
1531         if (sk->sk_prot->compat_setsockopt != NULL)
1532                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1533                                                       optval, optlen);
1534         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1535 }
1536 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1537 #endif
1538
1539 void sk_common_release(struct sock *sk)
1540 {
1541         if (sk->sk_prot->destroy)
1542                 sk->sk_prot->destroy(sk);
1543
1544         /*
1545          * Observation: when sock_common_release is called, processes have
1546          * no access to socket. But net still has.
1547          * Step one, detach it from networking:
1548          *
1549          * A. Remove from hash tables.
1550          */
1551
1552         sk->sk_prot->unhash(sk);
1553
1554         /*
1555          * In this point socket cannot receive new packets, but it is possible
1556          * that some packets are in flight because some CPU runs receiver and
1557          * did hash table lookup before we unhashed socket. They will achieve
1558          * receive queue and will be purged by socket destructor.
1559          *
1560          * Also we still have packets pending on receive queue and probably,
1561          * our own packets waiting in device queues. sock_destroy will drain
1562          * receive queue, but transmitted packets will delay socket destruction
1563          * until the last reference will be released.
1564          */
1565
1566         sock_orphan(sk);
1567
1568         xfrm_sk_free_policy(sk);
1569
1570         sk_refcnt_debug_release(sk);
1571         sock_put(sk);
1572 }
1573
1574 EXPORT_SYMBOL(sk_common_release);
1575
1576 static DEFINE_RWLOCK(proto_list_lock);
1577 static LIST_HEAD(proto_list);
1578
1579 int proto_register(struct proto *prot, int alloc_slab)
1580 {
1581         char *request_sock_slab_name = NULL;
1582         char *timewait_sock_slab_name;
1583         int rc = -ENOBUFS;
1584
1585         if (alloc_slab) {
1586                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1587                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1588
1589                 if (prot->slab == NULL) {
1590                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1591                                prot->name);
1592                         goto out;
1593                 }
1594
1595                 if (prot->rsk_prot != NULL) {
1596                         static const char mask[] = "request_sock_%s";
1597
1598                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1599                         if (request_sock_slab_name == NULL)
1600                                 goto out_free_sock_slab;
1601
1602                         sprintf(request_sock_slab_name, mask, prot->name);
1603                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1604                                                                  prot->rsk_prot->obj_size, 0,
1605                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1606
1607                         if (prot->rsk_prot->slab == NULL) {
1608                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1609                                        prot->name);
1610                                 goto out_free_request_sock_slab_name;
1611                         }
1612                 }
1613
1614                 if (prot->twsk_prot != NULL) {
1615                         static const char mask[] = "tw_sock_%s";
1616
1617                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1618
1619                         if (timewait_sock_slab_name == NULL)
1620                                 goto out_free_request_sock_slab;
1621
1622                         sprintf(timewait_sock_slab_name, mask, prot->name);
1623                         prot->twsk_prot->twsk_slab =
1624                                 kmem_cache_create(timewait_sock_slab_name,
1625                                                   prot->twsk_prot->twsk_obj_size,
1626                                                   0, SLAB_HWCACHE_ALIGN,
1627                                                   NULL, NULL);
1628                         if (prot->twsk_prot->twsk_slab == NULL)
1629                                 goto out_free_timewait_sock_slab_name;
1630                 }
1631         }
1632
1633         write_lock(&proto_list_lock);
1634         list_add(&prot->node, &proto_list);
1635         write_unlock(&proto_list_lock);
1636         rc = 0;
1637 out:
1638         return rc;
1639 out_free_timewait_sock_slab_name:
1640         kfree(timewait_sock_slab_name);
1641 out_free_request_sock_slab:
1642         if (prot->rsk_prot && prot->rsk_prot->slab) {
1643                 kmem_cache_destroy(prot->rsk_prot->slab);
1644                 prot->rsk_prot->slab = NULL;
1645         }
1646 out_free_request_sock_slab_name:
1647         kfree(request_sock_slab_name);
1648 out_free_sock_slab:
1649         kmem_cache_destroy(prot->slab);
1650         prot->slab = NULL;
1651         goto out;
1652 }
1653
1654 EXPORT_SYMBOL(proto_register);
1655
1656 void proto_unregister(struct proto *prot)
1657 {
1658         write_lock(&proto_list_lock);
1659         list_del(&prot->node);
1660         write_unlock(&proto_list_lock);
1661
1662         if (prot->slab != NULL) {
1663                 kmem_cache_destroy(prot->slab);
1664                 prot->slab = NULL;
1665         }
1666
1667         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1668                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1669
1670                 kmem_cache_destroy(prot->rsk_prot->slab);
1671                 kfree(name);
1672                 prot->rsk_prot->slab = NULL;
1673         }
1674
1675         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1676                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1677
1678                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1679                 kfree(name);
1680                 prot->twsk_prot->twsk_slab = NULL;
1681         }
1682 }
1683
1684 EXPORT_SYMBOL(proto_unregister);
1685
1686 #ifdef CONFIG_PROC_FS
1687 static inline struct proto *__proto_head(void)
1688 {
1689         return list_entry(proto_list.next, struct proto, node);
1690 }
1691
1692 static inline struct proto *proto_head(void)
1693 {
1694         return list_empty(&proto_list) ? NULL : __proto_head();
1695 }
1696
1697 static inline struct proto *proto_next(struct proto *proto)
1698 {
1699         return proto->node.next == &proto_list ? NULL :
1700                 list_entry(proto->node.next, struct proto, node);
1701 }
1702
1703 static inline struct proto *proto_get_idx(loff_t pos)
1704 {
1705         struct proto *proto;
1706         loff_t i = 0;
1707
1708         list_for_each_entry(proto, &proto_list, node)
1709                 if (i++ == pos)
1710                         goto out;
1711
1712         proto = NULL;
1713 out:
1714         return proto;
1715 }
1716
1717 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1718 {
1719         read_lock(&proto_list_lock);
1720         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1721 }
1722
1723 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1724 {
1725         ++*pos;
1726         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1727 }
1728
1729 static void proto_seq_stop(struct seq_file *seq, void *v)
1730 {
1731         read_unlock(&proto_list_lock);
1732 }
1733
/* Map a method pointer to a flag character: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
1738
1739 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1740 {
1741         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1742                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1743                    proto->name,
1744                    proto->obj_size,
1745                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1746                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1747                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1748                    proto->max_header,
1749                    proto->slab == NULL ? "no" : "yes",
1750                    module_name(proto->owner),
1751                    proto_method_implemented(proto->close),
1752                    proto_method_implemented(proto->connect),
1753                    proto_method_implemented(proto->disconnect),
1754                    proto_method_implemented(proto->accept),
1755                    proto_method_implemented(proto->ioctl),
1756                    proto_method_implemented(proto->init),
1757                    proto_method_implemented(proto->destroy),
1758                    proto_method_implemented(proto->shutdown),
1759                    proto_method_implemented(proto->setsockopt),
1760                    proto_method_implemented(proto->getsockopt),
1761                    proto_method_implemented(proto->sendmsg),
1762                    proto_method_implemented(proto->recvmsg),
1763                    proto_method_implemented(proto->sendpage),
1764                    proto_method_implemented(proto->bind),
1765                    proto_method_implemented(proto->backlog_rcv),
1766                    proto_method_implemented(proto->hash),
1767                    proto_method_implemented(proto->unhash),
1768                    proto_method_implemented(proto->get_port),
1769                    proto_method_implemented(proto->enter_memory_pressure));
1770 }
1771
1772 static int proto_seq_show(struct seq_file *seq, void *v)
1773 {
1774         if (v == SEQ_START_TOKEN)
1775                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1776                            "protocol",
1777                            "size",
1778                            "sockets",
1779                            "memory",
1780                            "press",
1781                            "maxhdr",
1782                            "slab",
1783                            "module",
1784                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1785         else
1786                 proto_seq_printf(seq, v);
1787         return 0;
1788 }
1789
1790 static struct seq_operations proto_seq_ops = {
1791         .start  = proto_seq_start,
1792         .next   = proto_seq_next,
1793         .stop   = proto_seq_stop,
1794         .show   = proto_seq_show,
1795 };
1796
1797 static int proto_seq_open(struct inode *inode, struct file *file)
1798 {
1799         return seq_open(file, &proto_seq_ops);
1800 }
1801
1802 static struct file_operations proto_seq_fops = {
1803         .owner          = THIS_MODULE,
1804         .open           = proto_seq_open,
1805         .read           = seq_read,
1806         .llseek         = seq_lseek,
1807         .release        = seq_release,
1808 };
1809
1810 static int __init proto_init(void)
1811 {
1812         /* register /proc/net/protocols */
1813         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1814 }
1815
1816 subsys_initcall(proto_init);
1817
1818 #endif /* PROC_FS */
1819
/* Generic socket helpers exported from this file, in alphabetical order. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sysctl_optmem_max);
/* These two are only exported when CONFIG_SYSCTL is set. */
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif /* CONFIG_SYSCTL */