Pull bugzilla-5653 into release branch
[linux-2.6] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:     Ross Biro
13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *              Florian La Roche, <flla@stud.uni-sb.de>
15  *              Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *              Alan Cox        :       Numerous verify_area() problems
19  *              Alan Cox        :       Connecting on a connecting socket
20  *                                      now returns an error for tcp.
21  *              Alan Cox        :       sock->protocol is set correctly.
22  *                                      and is not sometimes left as 0.
23  *              Alan Cox        :       connect handles icmp errors on a
24  *                                      connect properly. Unfortunately there
25  *                                      is a restart syscall nasty there. I
26  *                                      can't match BSD without hacking the C
27  *                                      library. Ideas urgently sought!
28  *              Alan Cox        :       Disallow bind() to addresses that are
29  *                                      not ours - especially broadcast ones!!
30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
32  *                                      instead they leave that for the DESTROY timer.
33  *              Alan Cox        :       Clean up error flag in accept
34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
35  *                                      was buggy. Put a remove_sock() in the handler
36  *                                      for memory when we hit 0. Also altered the timer
37  *                                      code. The ACK stuff can wait and needs major 
38  *                                      TCP layer surgery.
39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
40  *                                      and fixed timer/inet_bh race.
41  *              Alan Cox        :       Added zapped flag for TCP
42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
49  *      Pauline Middelink       :       identd support
50  *              Alan Cox        :       Fixed connect() taking signals I think.
51  *              Alan Cox        :       SO_LINGER supported
52  *              Alan Cox        :       Error reporting fixes
53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
54  *              Alan Cox        :       inet sockets don't set sk->type!
55  *              Alan Cox        :       Split socket option code
56  *              Alan Cox        :       Callbacks
57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
58  *              Alex            :       Removed restriction on inet fioctl
59  *              Alan Cox        :       Splitting INET from NET core
60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
62  *              Alan Cox        :       Split IP from generic code
63  *              Alan Cox        :       New kfree_skbmem()
64  *              Alan Cox        :       Make SO_DEBUG superuser only.
65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
66  *                                      (compatibility fix)
67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
68  *              Alan Cox        :       Allocator for a socket is settable.
69  *              Alan Cox        :       SO_ERROR includes soft errors.
70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
71  *              Alan Cox        :       Generic socket allocation to make hooks
72  *                                      easier (suggested by Craig Metz).
73  *              Michael Pall    :       SO_ERROR returns positive errno again
74  *              Steve Whitehouse:       Added default destructor to free
75  *                                      protocol private data.
76  *              Steve Whitehouse:       Added various other default routines
77  *                                      common to several socket families.
78  *              Chris Evans     :       Call suser() check last on F_SETOWN
79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
81  *              Andi Kleen      :       Fix write_space callback
82  *              Chris Evans     :       Security fixes - signedness again
83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *              This program is free software; you can redistribute it and/or
89  *              modify it under the terms of the GNU General Public License
90  *              as published by the Free Software Foundation; either version
91  *              2 of the License, or (at your option) any later version.
92  */
93
94 #include <linux/capability.h>
95 #include <linux/config.h>
96 #include <linux/errno.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115
116 #include <asm/uaccess.h>
117 #include <asm/system.h>
118
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/request_sock.h>
123 #include <net/sock.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
133 /* Take into consideration the size of the struct sk_buff overhead in the
134  * determination of these values, since that is non-constant across
135  * platforms.  This makes socket queueing behavior and performance
136  * not depend upon such differences.
137  */
138 #define _SK_MEM_PACKETS         256
139 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
140 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
141 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
142
143 /* Run time adjustable parameters. */
144 __u32 sysctl_wmem_max = SK_WMEM_MAX;
145 __u32 sysctl_rmem_max = SK_RMEM_MAX;
146 __u32 sysctl_wmem_default = SK_WMEM_MAX;
147 __u32 sysctl_rmem_default = SK_RMEM_MAX;
148
149 /* Maximal space eaten by iovec or ancilliary data plus some space */
150 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
151
152 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
153 {
154         struct timeval tv;
155
156         if (optlen < sizeof(tv))
157                 return -EINVAL;
158         if (copy_from_user(&tv, optval, sizeof(tv)))
159                 return -EFAULT;
160
161         *timeo_p = MAX_SCHEDULE_TIMEOUT;
162         if (tv.tv_sec == 0 && tv.tv_usec == 0)
163                 return 0;
164         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
165                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
166         return 0;
167 }
168
169 static void sock_warn_obsolete_bsdism(const char *name)
170 {
171         static int warned;
172         static char warncomm[TASK_COMM_LEN];
173         if (strcmp(warncomm, current->comm) && warned < 5) { 
174                 strcpy(warncomm,  current->comm); 
175                 printk(KERN_WARNING "process `%s' is using obsolete "
176                        "%s SO_BSDCOMPAT\n", warncomm, name);
177                 warned++;
178         }
179 }
180
181 static void sock_disable_timestamp(struct sock *sk)
182 {       
183         if (sock_flag(sk, SOCK_TIMESTAMP)) { 
184                 sock_reset_flag(sk, SOCK_TIMESTAMP);
185                 net_disable_timestamp();
186         }
187 }
188
189
190 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
191 {
192         int err = 0;
193         int skb_len;
194
195         /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
196            number of warnings when compiling with -W --ANK
197          */
198         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
199             (unsigned)sk->sk_rcvbuf) {
200                 err = -ENOMEM;
201                 goto out;
202         }
203
204         /* It would be deadlock, if sock_queue_rcv_skb is used
205            with socket lock! We assume that users of this
206            function are lock free.
207         */
208         err = sk_filter(sk, skb, 1);
209         if (err)
210                 goto out;
211
212         skb->dev = NULL;
213         skb_set_owner_r(skb, sk);
214
215         /* Cache the SKB length before we tack it onto the receive
216          * queue.  Once it is added it no longer belongs to us and
217          * may be freed by other threads of control pulling packets
218          * from the queue.
219          */
220         skb_len = skb->len;
221
222         skb_queue_tail(&sk->sk_receive_queue, skb);
223
224         if (!sock_flag(sk, SOCK_DEAD))
225                 sk->sk_data_ready(sk, skb_len);
226 out:
227         return err;
228 }
229 EXPORT_SYMBOL(sock_queue_rcv_skb);
230
231 int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
232 {
233         int rc = NET_RX_SUCCESS;
234
235         if (sk_filter(sk, skb, 0))
236                 goto discard_and_relse;
237
238         skb->dev = NULL;
239
240         bh_lock_sock(sk);
241         if (!sock_owned_by_user(sk))
242                 rc = sk->sk_backlog_rcv(sk, skb);
243         else
244                 sk_add_backlog(sk, skb);
245         bh_unlock_sock(sk);
246 out:
247         sock_put(sk);
248         return rc;
249 discard_and_relse:
250         kfree_skb(skb);
251         goto out;
252 }
253 EXPORT_SYMBOL(sk_receive_skb);
254
255 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
256 {
257         struct dst_entry *dst = sk->sk_dst_cache;
258
259         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
260                 sk->sk_dst_cache = NULL;
261                 dst_release(dst);
262                 return NULL;
263         }
264
265         return dst;
266 }
267 EXPORT_SYMBOL(__sk_dst_check);
268
269 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
270 {
271         struct dst_entry *dst = sk_dst_get(sk);
272
273         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
274                 sk_dst_reset(sk);
275                 dst_release(dst);
276                 return NULL;
277         }
278
279         return dst;
280 }
281 EXPORT_SYMBOL(sk_dst_check);
282
283 /*
284  *      This is meant for all protocols to use and covers goings on
285  *      at the socket level. Everything here is generic.
286  */
287
288 int sock_setsockopt(struct socket *sock, int level, int optname,
289                     char __user *optval, int optlen)
290 {
291         struct sock *sk=sock->sk;
292         struct sk_filter *filter;
293         int val;
294         int valbool;
295         struct linger ling;
296         int ret = 0;
297         
298         /*
299          *      Options without arguments
300          */
301
302 #ifdef SO_DONTLINGER            /* Compatibility item... */
303         if (optname == SO_DONTLINGER) {
304                 lock_sock(sk);
305                 sock_reset_flag(sk, SOCK_LINGER);
306                 release_sock(sk);
307                 return 0;
308         }
309 #endif
310         
311         if(optlen<sizeof(int))
312                 return(-EINVAL);
313         
314         if (get_user(val, (int __user *)optval))
315                 return -EFAULT;
316         
317         valbool = val?1:0;
318
319         lock_sock(sk);
320
321         switch(optname) 
322         {
323                 case SO_DEBUG:  
324                         if(val && !capable(CAP_NET_ADMIN))
325                         {
326                                 ret = -EACCES;
327                         }
328                         else if (valbool)
329                                 sock_set_flag(sk, SOCK_DBG);
330                         else
331                                 sock_reset_flag(sk, SOCK_DBG);
332                         break;
333                 case SO_REUSEADDR:
334                         sk->sk_reuse = valbool;
335                         break;
336                 case SO_TYPE:
337                 case SO_ERROR:
338                         ret = -ENOPROTOOPT;
339                         break;
340                 case SO_DONTROUTE:
341                         if (valbool)
342                                 sock_set_flag(sk, SOCK_LOCALROUTE);
343                         else
344                                 sock_reset_flag(sk, SOCK_LOCALROUTE);
345                         break;
346                 case SO_BROADCAST:
347                         sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
348                         break;
349                 case SO_SNDBUF:
350                         /* Don't error on this BSD doesn't and if you think
351                            about it this is right. Otherwise apps have to
352                            play 'guess the biggest size' games. RCVBUF/SNDBUF
353                            are treated in BSD as hints */
354                            
355                         if (val > sysctl_wmem_max)
356                                 val = sysctl_wmem_max;
357 set_sndbuf:
358                         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
359                         if ((val * 2) < SOCK_MIN_SNDBUF)
360                                 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
361                         else
362                                 sk->sk_sndbuf = val * 2;
363
364                         /*
365                          *      Wake up sending tasks if we
366                          *      upped the value.
367                          */
368                         sk->sk_write_space(sk);
369                         break;
370
371                 case SO_SNDBUFFORCE:
372                         if (!capable(CAP_NET_ADMIN)) {
373                                 ret = -EPERM;
374                                 break;
375                         }
376                         goto set_sndbuf;
377
378                 case SO_RCVBUF:
379                         /* Don't error on this BSD doesn't and if you think
380                            about it this is right. Otherwise apps have to
381                            play 'guess the biggest size' games. RCVBUF/SNDBUF
382                            are treated in BSD as hints */
383                           
384                         if (val > sysctl_rmem_max)
385                                 val = sysctl_rmem_max;
386 set_rcvbuf:
387                         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
388                         /*
389                          * We double it on the way in to account for
390                          * "struct sk_buff" etc. overhead.   Applications
391                          * assume that the SO_RCVBUF setting they make will
392                          * allow that much actual data to be received on that
393                          * socket.
394                          *
395                          * Applications are unaware that "struct sk_buff" and
396                          * other overheads allocate from the receive buffer
397                          * during socket buffer allocation.
398                          *
399                          * And after considering the possible alternatives,
400                          * returning the value we actually used in getsockopt
401                          * is the most desirable behavior.
402                          */
403                         if ((val * 2) < SOCK_MIN_RCVBUF)
404                                 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
405                         else
406                                 sk->sk_rcvbuf = val * 2;
407                         break;
408
409                 case SO_RCVBUFFORCE:
410                         if (!capable(CAP_NET_ADMIN)) {
411                                 ret = -EPERM;
412                                 break;
413                         }
414                         goto set_rcvbuf;
415
416                 case SO_KEEPALIVE:
417 #ifdef CONFIG_INET
418                         if (sk->sk_protocol == IPPROTO_TCP)
419                                 tcp_set_keepalive(sk, valbool);
420 #endif
421                         sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
422                         break;
423
424                 case SO_OOBINLINE:
425                         sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
426                         break;
427
428                 case SO_NO_CHECK:
429                         sk->sk_no_check = valbool;
430                         break;
431
432                 case SO_PRIORITY:
433                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
434                                 sk->sk_priority = val;
435                         else
436                                 ret = -EPERM;
437                         break;
438
439                 case SO_LINGER:
440                         if(optlen<sizeof(ling)) {
441                                 ret = -EINVAL;  /* 1003.1g */
442                                 break;
443                         }
444                         if (copy_from_user(&ling,optval,sizeof(ling))) {
445                                 ret = -EFAULT;
446                                 break;
447                         }
448                         if (!ling.l_onoff)
449                                 sock_reset_flag(sk, SOCK_LINGER);
450                         else {
451 #if (BITS_PER_LONG == 32)
452                                 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
453                                         sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
454                                 else
455 #endif
456                                         sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
457                                 sock_set_flag(sk, SOCK_LINGER);
458                         }
459                         break;
460
461                 case SO_BSDCOMPAT:
462                         sock_warn_obsolete_bsdism("setsockopt");
463                         break;
464
465                 case SO_PASSCRED:
466                         if (valbool)
467                                 set_bit(SOCK_PASSCRED, &sock->flags);
468                         else
469                                 clear_bit(SOCK_PASSCRED, &sock->flags);
470                         break;
471
472                 case SO_TIMESTAMP:
473                         if (valbool)  {
474                                 sock_set_flag(sk, SOCK_RCVTSTAMP);
475                                 sock_enable_timestamp(sk);
476                         } else
477                                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
478                         break;
479
480                 case SO_RCVLOWAT:
481                         if (val < 0)
482                                 val = INT_MAX;
483                         sk->sk_rcvlowat = val ? : 1;
484                         break;
485
486                 case SO_RCVTIMEO:
487                         ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
488                         break;
489
490                 case SO_SNDTIMEO:
491                         ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
492                         break;
493
494 #ifdef CONFIG_NETDEVICES
495                 case SO_BINDTODEVICE:
496                 {
497                         char devname[IFNAMSIZ]; 
498
499                         /* Sorry... */ 
500                         if (!capable(CAP_NET_RAW)) {
501                                 ret = -EPERM;
502                                 break;
503                         }
504
505                         /* Bind this socket to a particular device like "eth0",
506                          * as specified in the passed interface name. If the
507                          * name is "" or the option length is zero the socket 
508                          * is not bound. 
509                          */ 
510
511                         if (!valbool) {
512                                 sk->sk_bound_dev_if = 0;
513                         } else {
514                                 if (optlen > IFNAMSIZ - 1)
515                                         optlen = IFNAMSIZ - 1;
516                                 memset(devname, 0, sizeof(devname));
517                                 if (copy_from_user(devname, optval, optlen)) {
518                                         ret = -EFAULT;
519                                         break;
520                                 }
521
522                                 /* Remove any cached route for this socket. */
523                                 sk_dst_reset(sk);
524
525                                 if (devname[0] == '\0') {
526                                         sk->sk_bound_dev_if = 0;
527                                 } else {
528                                         struct net_device *dev = dev_get_by_name(devname);
529                                         if (!dev) {
530                                                 ret = -ENODEV;
531                                                 break;
532                                         }
533                                         sk->sk_bound_dev_if = dev->ifindex;
534                                         dev_put(dev);
535                                 }
536                         }
537                         break;
538                 }
539 #endif
540
541
542                 case SO_ATTACH_FILTER:
543                         ret = -EINVAL;
544                         if (optlen == sizeof(struct sock_fprog)) {
545                                 struct sock_fprog fprog;
546
547                                 ret = -EFAULT;
548                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
549                                         break;
550
551                                 ret = sk_attach_filter(&fprog, sk);
552                         }
553                         break;
554
555                 case SO_DETACH_FILTER:
556                         spin_lock_bh(&sk->sk_lock.slock);
557                         filter = sk->sk_filter;
558                         if (filter) {
559                                 sk->sk_filter = NULL;
560                                 spin_unlock_bh(&sk->sk_lock.slock);
561                                 sk_filter_release(sk, filter);
562                                 break;
563                         }
564                         spin_unlock_bh(&sk->sk_lock.slock);
565                         ret = -ENONET;
566                         break;
567
568                 /* We implement the SO_SNDLOWAT etc to
569                    not be settable (1003.1g 5.3) */
570                 default:
571                         ret = -ENOPROTOOPT;
572                         break;
573         }
574         release_sock(sk);
575         return ret;
576 }
577
578
579 int sock_getsockopt(struct socket *sock, int level, int optname,
580                     char __user *optval, int __user *optlen)
581 {
582         struct sock *sk = sock->sk;
583         
584         union
585         {
586                 int val;
587                 struct linger ling;
588                 struct timeval tm;
589         } v;
590         
591         unsigned int lv = sizeof(int);
592         int len;
593         
594         if(get_user(len,optlen))
595                 return -EFAULT;
596         if(len < 0)
597                 return -EINVAL;
598                 
599         switch(optname) 
600         {
601                 case SO_DEBUG:          
602                         v.val = sock_flag(sk, SOCK_DBG);
603                         break;
604                 
605                 case SO_DONTROUTE:
606                         v.val = sock_flag(sk, SOCK_LOCALROUTE);
607                         break;
608                 
609                 case SO_BROADCAST:
610                         v.val = !!sock_flag(sk, SOCK_BROADCAST);
611                         break;
612
613                 case SO_SNDBUF:
614                         v.val = sk->sk_sndbuf;
615                         break;
616                 
617                 case SO_RCVBUF:
618                         v.val = sk->sk_rcvbuf;
619                         break;
620
621                 case SO_REUSEADDR:
622                         v.val = sk->sk_reuse;
623                         break;
624
625                 case SO_KEEPALIVE:
626                         v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
627                         break;
628
629                 case SO_TYPE:
630                         v.val = sk->sk_type;                            
631                         break;
632
633                 case SO_ERROR:
634                         v.val = -sock_error(sk);
635                         if(v.val==0)
636                                 v.val = xchg(&sk->sk_err_soft, 0);
637                         break;
638
639                 case SO_OOBINLINE:
640                         v.val = !!sock_flag(sk, SOCK_URGINLINE);
641                         break;
642         
643                 case SO_NO_CHECK:
644                         v.val = sk->sk_no_check;
645                         break;
646
647                 case SO_PRIORITY:
648                         v.val = sk->sk_priority;
649                         break;
650                 
651                 case SO_LINGER: 
652                         lv              = sizeof(v.ling);
653                         v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
654                         v.ling.l_linger = sk->sk_lingertime / HZ;
655                         break;
656                                         
657                 case SO_BSDCOMPAT:
658                         sock_warn_obsolete_bsdism("getsockopt");
659                         break;
660
661                 case SO_TIMESTAMP:
662                         v.val = sock_flag(sk, SOCK_RCVTSTAMP);
663                         break;
664
665                 case SO_RCVTIMEO:
666                         lv=sizeof(struct timeval);
667                         if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
668                                 v.tm.tv_sec = 0;
669                                 v.tm.tv_usec = 0;
670                         } else {
671                                 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
672                                 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
673                         }
674                         break;
675
676                 case SO_SNDTIMEO:
677                         lv=sizeof(struct timeval);
678                         if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
679                                 v.tm.tv_sec = 0;
680                                 v.tm.tv_usec = 0;
681                         } else {
682                                 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
683                                 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
684                         }
685                         break;
686
687                 case SO_RCVLOWAT:
688                         v.val = sk->sk_rcvlowat;
689                         break;
690
691                 case SO_SNDLOWAT:
692                         v.val=1;
693                         break; 
694
695                 case SO_PASSCRED:
696                         v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
697                         break;
698
699                 case SO_PEERCRED:
700                         if (len > sizeof(sk->sk_peercred))
701                                 len = sizeof(sk->sk_peercred);
702                         if (copy_to_user(optval, &sk->sk_peercred, len))
703                                 return -EFAULT;
704                         goto lenout;
705
706                 case SO_PEERNAME:
707                 {
708                         char address[128];
709
710                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
711                                 return -ENOTCONN;
712                         if (lv < len)
713                                 return -EINVAL;
714                         if (copy_to_user(optval, address, len))
715                                 return -EFAULT;
716                         goto lenout;
717                 }
718
719                 /* Dubious BSD thing... Probably nobody even uses it, but
720                  * the UNIX standard wants it for whatever reason... -DaveM
721                  */
722                 case SO_ACCEPTCONN:
723                         v.val = sk->sk_state == TCP_LISTEN;
724                         break;
725
726                 case SO_PEERSEC:
727                         return security_socket_getpeersec_stream(sock, optval, optlen, len);
728
729                 default:
730                         return(-ENOPROTOOPT);
731         }
732         if (len > lv)
733                 len = lv;
734         if (copy_to_user(optval, &v, len))
735                 return -EFAULT;
736 lenout:
737         if (put_user(len, optlen))
738                 return -EFAULT;
739         return 0;
740 }
741
742 /**
743  *      sk_alloc - All socket objects are allocated here
744  *      @family: protocol family
745  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
746  *      @prot: struct proto associated with this new sock instance
747  *      @zero_it: if we should zero the newly allocated sock
748  */
749 struct sock *sk_alloc(int family, gfp_t priority,
750                       struct proto *prot, int zero_it)
751 {
752         struct sock *sk = NULL;
753         kmem_cache_t *slab = prot->slab;
754
755         if (slab != NULL)
756                 sk = kmem_cache_alloc(slab, priority);
757         else
758                 sk = kmalloc(prot->obj_size, priority);
759
760         if (sk) {
761                 if (zero_it) {
762                         memset(sk, 0, prot->obj_size);
763                         sk->sk_family = family;
764                         /*
765                          * See comment in struct sock definition to understand
766                          * why we need sk_prot_creator -acme
767                          */
768                         sk->sk_prot = sk->sk_prot_creator = prot;
769                         sock_lock_init(sk);
770                 }
771                 
772                 if (security_sk_alloc(sk, family, priority))
773                         goto out_free;
774
775                 if (!try_module_get(prot->owner))
776                         goto out_free;
777         }
778         return sk;
779
780 out_free:
781         if (slab != NULL)
782                 kmem_cache_free(slab, sk);
783         else
784                 kfree(sk);
785         return NULL;
786 }
787
788 void sk_free(struct sock *sk)
789 {
790         struct sk_filter *filter;
791         struct module *owner = sk->sk_prot_creator->owner;
792
793         if (sk->sk_destruct)
794                 sk->sk_destruct(sk);
795
796         filter = sk->sk_filter;
797         if (filter) {
798                 sk_filter_release(sk, filter);
799                 sk->sk_filter = NULL;
800         }
801
802         sock_disable_timestamp(sk);
803
804         if (atomic_read(&sk->sk_omem_alloc))
805                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
806                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
807
808         security_sk_free(sk);
809         if (sk->sk_prot_creator->slab != NULL)
810                 kmem_cache_free(sk->sk_prot_creator->slab, sk);
811         else
812                 kfree(sk);
813         module_put(owner);
814 }
815
816 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
817 {
818         struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
819
820         if (newsk != NULL) {
821                 struct sk_filter *filter;
822
823                 memcpy(newsk, sk, sk->sk_prot->obj_size);
824
825                 /* SANITY */
826                 sk_node_init(&newsk->sk_node);
827                 sock_lock_init(newsk);
828                 bh_lock_sock(newsk);
829
830                 atomic_set(&newsk->sk_rmem_alloc, 0);
831                 atomic_set(&newsk->sk_wmem_alloc, 0);
832                 atomic_set(&newsk->sk_omem_alloc, 0);
833                 skb_queue_head_init(&newsk->sk_receive_queue);
834                 skb_queue_head_init(&newsk->sk_write_queue);
835
836                 rwlock_init(&newsk->sk_dst_lock);
837                 rwlock_init(&newsk->sk_callback_lock);
838
839                 newsk->sk_dst_cache     = NULL;
840                 newsk->sk_wmem_queued   = 0;
841                 newsk->sk_forward_alloc = 0;
842                 newsk->sk_send_head     = NULL;
843                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
844                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
845
846                 sock_reset_flag(newsk, SOCK_DONE);
847                 skb_queue_head_init(&newsk->sk_error_queue);
848
849                 filter = newsk->sk_filter;
850                 if (filter != NULL)
851                         sk_filter_charge(newsk, filter);
852
853                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
854                         /* It is still raw copy of parent, so invalidate
855                          * destructor and make plain sk_free() */
856                         newsk->sk_destruct = NULL;
857                         sk_free(newsk);
858                         newsk = NULL;
859                         goto out;
860                 }
861
862                 newsk->sk_err      = 0;
863                 newsk->sk_priority = 0;
864                 atomic_set(&newsk->sk_refcnt, 2);
865
866                 /*
867                  * Increment the counter in the same struct proto as the master
868                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
869                  * is the same as sk->sk_prot->socks, as this field was copied
870                  * with memcpy).
871                  *
872                  * This _changes_ the previous behaviour, where
873                  * tcp_create_openreq_child always was incrementing the
874                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
875                  * to be taken into account in all callers. -acme
876                  */
877                 sk_refcnt_debug_inc(newsk);
878                 newsk->sk_socket = NULL;
879                 newsk->sk_sleep  = NULL;
880
881                 if (newsk->sk_prot->sockets_allocated)
882                         atomic_inc(newsk->sk_prot->sockets_allocated);
883         }
884 out:
885         return newsk;
886 }
887
888 EXPORT_SYMBOL_GPL(sk_clone);
889
890 void __init sk_init(void)
891 {
892         if (num_physpages <= 4096) {
893                 sysctl_wmem_max = 32767;
894                 sysctl_rmem_max = 32767;
895                 sysctl_wmem_default = 32767;
896                 sysctl_rmem_default = 32767;
897         } else if (num_physpages >= 131072) {
898                 sysctl_wmem_max = 131071;
899                 sysctl_rmem_max = 131071;
900         }
901 }
902
903 /*
904  *      Simple resource managers for sockets.
905  */
906
907
908 /* 
909  * Write buffer destructor automatically called from kfree_skb. 
910  */
911 void sock_wfree(struct sk_buff *skb)
912 {
913         struct sock *sk = skb->sk;
914
915         /* In case it might be waiting for more memory. */
916         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
917         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
918                 sk->sk_write_space(sk);
919         sock_put(sk);
920 }
921
922 /* 
923  * Read buffer destructor automatically called from kfree_skb. 
924  */
925 void sock_rfree(struct sk_buff *skb)
926 {
927         struct sock *sk = skb->sk;
928
929         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
930 }
931
932
933 int sock_i_uid(struct sock *sk)
934 {
935         int uid;
936
937         read_lock(&sk->sk_callback_lock);
938         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
939         read_unlock(&sk->sk_callback_lock);
940         return uid;
941 }
942
943 unsigned long sock_i_ino(struct sock *sk)
944 {
945         unsigned long ino;
946
947         read_lock(&sk->sk_callback_lock);
948         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
949         read_unlock(&sk->sk_callback_lock);
950         return ino;
951 }
952
953 /*
954  * Allocate a skb from the socket's send buffer.
955  */
956 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
957                              gfp_t priority)
958 {
959         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
960                 struct sk_buff * skb = alloc_skb(size, priority);
961                 if (skb) {
962                         skb_set_owner_w(skb, sk);
963                         return skb;
964                 }
965         }
966         return NULL;
967 }
968
969 /*
970  * Allocate a skb from the socket's receive buffer.
971  */ 
972 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
973                              gfp_t priority)
974 {
975         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
976                 struct sk_buff *skb = alloc_skb(size, priority);
977                 if (skb) {
978                         skb_set_owner_r(skb, sk);
979                         return skb;
980                 }
981         }
982         return NULL;
983 }
984
985 /* 
986  * Allocate a memory block from the socket's option memory buffer.
987  */ 
988 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
989 {
990         if ((unsigned)size <= sysctl_optmem_max &&
991             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
992                 void *mem;
993                 /* First do the add, to avoid the race if kmalloc
994                  * might sleep.
995                  */
996                 atomic_add(size, &sk->sk_omem_alloc);
997                 mem = kmalloc(size, priority);
998                 if (mem)
999                         return mem;
1000                 atomic_sub(size, &sk->sk_omem_alloc);
1001         }
1002         return NULL;
1003 }
1004
1005 /*
1006  * Free an option memory block.
1007  */
1008 void sock_kfree_s(struct sock *sk, void *mem, int size)
1009 {
1010         kfree(mem);
1011         atomic_sub(size, &sk->sk_omem_alloc);
1012 }
1013
1014 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1015    I think, these locks should be removed for datagram sockets.
1016  */
1017 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1018 {
1019         DEFINE_WAIT(wait);
1020
1021         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1022         for (;;) {
1023                 if (!timeo)
1024                         break;
1025                 if (signal_pending(current))
1026                         break;
1027                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1028                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1029                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1030                         break;
1031                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1032                         break;
1033                 if (sk->sk_err)
1034                         break;
1035                 timeo = schedule_timeout(timeo);
1036         }
1037         finish_wait(sk->sk_sleep, &wait);
1038         return timeo;
1039 }
1040
1041
1042 /*
1043  *      Generic send/receive buffer handlers
1044  */
1045
1046 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1047                                             unsigned long header_len,
1048                                             unsigned long data_len,
1049                                             int noblock, int *errcode)
1050 {
1051         struct sk_buff *skb;
1052         gfp_t gfp_mask;
1053         long timeo;
1054         int err;
1055
1056         gfp_mask = sk->sk_allocation;
1057         if (gfp_mask & __GFP_WAIT)
1058                 gfp_mask |= __GFP_REPEAT;
1059
1060         timeo = sock_sndtimeo(sk, noblock);
1061         while (1) {
1062                 err = sock_error(sk);
1063                 if (err != 0)
1064                         goto failure;
1065
1066                 err = -EPIPE;
1067                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1068                         goto failure;
1069
1070                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1071                         skb = alloc_skb(header_len, sk->sk_allocation);
1072                         if (skb) {
1073                                 int npages;
1074                                 int i;
1075
1076                                 /* No pages, we're done... */
1077                                 if (!data_len)
1078                                         break;
1079
1080                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1081                                 skb->truesize += data_len;
1082                                 skb_shinfo(skb)->nr_frags = npages;
1083                                 for (i = 0; i < npages; i++) {
1084                                         struct page *page;
1085                                         skb_frag_t *frag;
1086
1087                                         page = alloc_pages(sk->sk_allocation, 0);
1088                                         if (!page) {
1089                                                 err = -ENOBUFS;
1090                                                 skb_shinfo(skb)->nr_frags = i;
1091                                                 kfree_skb(skb);
1092                                                 goto failure;
1093                                         }
1094
1095                                         frag = &skb_shinfo(skb)->frags[i];
1096                                         frag->page = page;
1097                                         frag->page_offset = 0;
1098                                         frag->size = (data_len >= PAGE_SIZE ?
1099                                                       PAGE_SIZE :
1100                                                       data_len);
1101                                         data_len -= PAGE_SIZE;
1102                                 }
1103
1104                                 /* Full success... */
1105                                 break;
1106                         }
1107                         err = -ENOBUFS;
1108                         goto failure;
1109                 }
1110                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1111                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1112                 err = -EAGAIN;
1113                 if (!timeo)
1114                         goto failure;
1115                 if (signal_pending(current))
1116                         goto interrupted;
1117                 timeo = sock_wait_for_wmem(sk, timeo);
1118         }
1119
1120         skb_set_owner_w(skb, sk);
1121         return skb;
1122
1123 interrupted:
1124         err = sock_intr_errno(timeo);
1125 failure:
1126         *errcode = err;
1127         return NULL;
1128 }
1129
/*
 * Allocate a purely linear send skb of @size bytes for @sk.
 *
 * Thin front end to sock_alloc_send_pskb() with no paged data; see that
 * function for the blocking behaviour and the values stored in *@errcode.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode);
}
1135
1136 static void __lock_sock(struct sock *sk)
1137 {
1138         DEFINE_WAIT(wait);
1139
1140         for(;;) {
1141                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1142                                         TASK_UNINTERRUPTIBLE);
1143                 spin_unlock_bh(&sk->sk_lock.slock);
1144                 schedule();
1145                 spin_lock_bh(&sk->sk_lock.slock);
1146                 if(!sock_owned_by_user(sk))
1147                         break;
1148         }
1149         finish_wait(&sk->sk_lock.wq, &wait);
1150 }
1151
1152 static void __release_sock(struct sock *sk)
1153 {
1154         struct sk_buff *skb = sk->sk_backlog.head;
1155
1156         do {
1157                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1158                 bh_unlock_sock(sk);
1159
1160                 do {
1161                         struct sk_buff *next = skb->next;
1162
1163                         skb->next = NULL;
1164                         sk->sk_backlog_rcv(sk, skb);
1165
1166                         /*
1167                          * We are in process context here with softirqs
1168                          * disabled, use cond_resched_softirq() to preempt.
1169                          * This is safe to do because we've taken the backlog
1170                          * queue private:
1171                          */
1172                         cond_resched_softirq();
1173
1174                         skb = next;
1175                 } while (skb != NULL);
1176
1177                 bh_lock_sock(sk);
1178         } while((skb = sk->sk_backlog.head) != NULL);
1179 }
1180
1181 /**
1182  * sk_wait_data - wait for data to arrive at sk_receive_queue
1183  * @sk:    sock to wait on
1184  * @timeo: for how long
1185  *
1186  * Now socket state including sk->sk_err is changed only under lock,
1187  * hence we may omit checks after joining wait queue.
1188  * We check receive queue before schedule() only as optimization;
1189  * it is very likely that release_sock() added new data.
1190  */
1191 int sk_wait_data(struct sock *sk, long *timeo)
1192 {
1193         int rc;
1194         DEFINE_WAIT(wait);
1195
1196         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1197         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1198         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1199         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1200         finish_wait(sk->sk_sleep, &wait);
1201         return rc;
1202 }
1203
1204 EXPORT_SYMBOL(sk_wait_data);
1205
1206 /*
1207  * Set of default routines for initialising struct proto_ops when
1208  * the protocol does not support a particular function. In certain
1209  * cases where it makes no sense for a protocol to have a "do nothing"
1210  * function, some default processing is provided.
1211  */
1212
1213 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1214 {
1215         return -EOPNOTSUPP;
1216 }
1217
1218 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 
1219                     int len, int flags)
1220 {
1221         return -EOPNOTSUPP;
1222 }
1223
1224 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1225 {
1226         return -EOPNOTSUPP;
1227 }
1228
1229 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1230 {
1231         return -EOPNOTSUPP;
1232 }
1233
1234 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 
1235                     int *len, int peer)
1236 {
1237         return -EOPNOTSUPP;
1238 }
1239
1240 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1241 {
1242         return 0;
1243 }
1244
1245 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1246 {
1247         return -EOPNOTSUPP;
1248 }
1249
1250 int sock_no_listen(struct socket *sock, int backlog)
1251 {
1252         return -EOPNOTSUPP;
1253 }
1254
1255 int sock_no_shutdown(struct socket *sock, int how)
1256 {
1257         return -EOPNOTSUPP;
1258 }
1259
1260 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1261                     char __user *optval, int optlen)
1262 {
1263         return -EOPNOTSUPP;
1264 }
1265
1266 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1267                     char __user *optval, int __user *optlen)
1268 {
1269         return -EOPNOTSUPP;
1270 }
1271
1272 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1273                     size_t len)
1274 {
1275         return -EOPNOTSUPP;
1276 }
1277
1278 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1279                     size_t len, int flags)
1280 {
1281         return -EOPNOTSUPP;
1282 }
1283
1284 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1285 {
1286         /* Mirror missing mmap method error code */
1287         return -ENODEV;
1288 }
1289
1290 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1291 {
1292         ssize_t res;
1293         struct msghdr msg = {.msg_flags = flags};
1294         struct kvec iov;
1295         char *kaddr = kmap(page);
1296         iov.iov_base = kaddr + offset;
1297         iov.iov_len = size;
1298         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1299         kunmap(page);
1300         return res;
1301 }
1302
1303 /*
1304  *      Default Socket Callbacks
1305  */
1306
1307 static void sock_def_wakeup(struct sock *sk)
1308 {
1309         read_lock(&sk->sk_callback_lock);
1310         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1311                 wake_up_interruptible_all(sk->sk_sleep);
1312         read_unlock(&sk->sk_callback_lock);
1313 }
1314
1315 static void sock_def_error_report(struct sock *sk)
1316 {
1317         read_lock(&sk->sk_callback_lock);
1318         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1319                 wake_up_interruptible(sk->sk_sleep);
1320         sk_wake_async(sk,0,POLL_ERR); 
1321         read_unlock(&sk->sk_callback_lock);
1322 }
1323
1324 static void sock_def_readable(struct sock *sk, int len)
1325 {
1326         read_lock(&sk->sk_callback_lock);
1327         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1328                 wake_up_interruptible(sk->sk_sleep);
1329         sk_wake_async(sk,1,POLL_IN);
1330         read_unlock(&sk->sk_callback_lock);
1331 }
1332
1333 static void sock_def_write_space(struct sock *sk)
1334 {
1335         read_lock(&sk->sk_callback_lock);
1336
1337         /* Do not wake up a writer until he can make "significant"
1338          * progress.  --DaveM
1339          */
1340         if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1341                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1342                         wake_up_interruptible(sk->sk_sleep);
1343
1344                 /* Should agree with poll, otherwise some programs break */
1345                 if (sock_writeable(sk))
1346                         sk_wake_async(sk, 2, POLL_OUT);
1347         }
1348
1349         read_unlock(&sk->sk_callback_lock);
1350 }
1351
1352 static void sock_def_destruct(struct sock *sk)
1353 {
1354         kfree(sk->sk_protinfo);
1355 }
1356
1357 void sk_send_sigurg(struct sock *sk)
1358 {
1359         if (sk->sk_socket && sk->sk_socket->file)
1360                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1361                         sk_wake_async(sk, 3, POLL_PRI);
1362 }
1363
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns zero (presumably meaning the timer was not already
 * pending -- confirm against the timer API).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}
1370
1371 EXPORT_SYMBOL(sk_reset_timer);
1372
/*
 * Cancel @timer.  If it was pending and del_timer() reports success, the
 * sock reference associated with it is dropped via __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
1378
1379 EXPORT_SYMBOL(sk_stop_timer);
1380
1381 void sock_init_data(struct socket *sock, struct sock *sk)
1382 {
1383         skb_queue_head_init(&sk->sk_receive_queue);
1384         skb_queue_head_init(&sk->sk_write_queue);
1385         skb_queue_head_init(&sk->sk_error_queue);
1386
1387         sk->sk_send_head        =       NULL;
1388
1389         init_timer(&sk->sk_timer);
1390         
1391         sk->sk_allocation       =       GFP_KERNEL;
1392         sk->sk_rcvbuf           =       sysctl_rmem_default;
1393         sk->sk_sndbuf           =       sysctl_wmem_default;
1394         sk->sk_state            =       TCP_CLOSE;
1395         sk->sk_socket           =       sock;
1396
1397         sock_set_flag(sk, SOCK_ZAPPED);
1398
1399         if(sock)
1400         {
1401                 sk->sk_type     =       sock->type;
1402                 sk->sk_sleep    =       &sock->wait;
1403                 sock->sk        =       sk;
1404         } else
1405                 sk->sk_sleep    =       NULL;
1406
1407         rwlock_init(&sk->sk_dst_lock);
1408         rwlock_init(&sk->sk_callback_lock);
1409
1410         sk->sk_state_change     =       sock_def_wakeup;
1411         sk->sk_data_ready       =       sock_def_readable;
1412         sk->sk_write_space      =       sock_def_write_space;
1413         sk->sk_error_report     =       sock_def_error_report;
1414         sk->sk_destruct         =       sock_def_destruct;
1415
1416         sk->sk_sndmsg_page      =       NULL;
1417         sk->sk_sndmsg_off       =       0;
1418
1419         sk->sk_peercred.pid     =       0;
1420         sk->sk_peercred.uid     =       -1;
1421         sk->sk_peercred.gid     =       -1;
1422         sk->sk_write_pending    =       0;
1423         sk->sk_rcvlowat         =       1;
1424         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1425         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1426
1427         sk->sk_stamp.tv_sec     = -1L;
1428         sk->sk_stamp.tv_usec    = -1L;
1429
1430         atomic_set(&sk->sk_refcnt, 1);
1431 }
1432
1433 void fastcall lock_sock(struct sock *sk)
1434 {
1435         might_sleep();
1436         spin_lock_bh(&(sk->sk_lock.slock));
1437         if (sk->sk_lock.owner)
1438                 __lock_sock(sk);
1439         sk->sk_lock.owner = (void *)1;
1440         spin_unlock_bh(&(sk->sk_lock.slock));
1441 }
1442
1443 EXPORT_SYMBOL(lock_sock);
1444
1445 void fastcall release_sock(struct sock *sk)
1446 {
1447         spin_lock_bh(&(sk->sk_lock.slock));
1448         if (sk->sk_backlog.tail)
1449                 __release_sock(sk);
1450         sk->sk_lock.owner = NULL;
1451         if (waitqueue_active(&(sk->sk_lock.wq)))
1452                 wake_up(&(sk->sk_lock.wq));
1453         spin_unlock_bh(&(sk->sk_lock.slock));
1454 }
1455 EXPORT_SYMBOL(release_sock);
1456
1457 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1458
1459         if (!sock_flag(sk, SOCK_TIMESTAMP))
1460                 sock_enable_timestamp(sk);
1461         if (sk->sk_stamp.tv_sec == -1) 
1462                 return -ENOENT;
1463         if (sk->sk_stamp.tv_sec == 0)
1464                 do_gettimeofday(&sk->sk_stamp);
1465         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1466                 -EFAULT : 0; 
1467
1468 EXPORT_SYMBOL(sock_get_timestamp);
1469
1470 void sock_enable_timestamp(struct sock *sk)
1471 {       
1472         if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
1473                 sock_set_flag(sk, SOCK_TIMESTAMP);
1474                 net_enable_timestamp();
1475         }
1476 }
1477 EXPORT_SYMBOL(sock_enable_timestamp); 
1478
1479 /*
1480  *      Get a socket option on an socket.
1481  *
1482  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1483  *      asynchronous errors should be reported by getsockopt. We assume
1484  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1485  */
1486 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1487                            char __user *optval, int __user *optlen)
1488 {
1489         struct sock *sk = sock->sk;
1490
1491         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1492 }
1493
1494 EXPORT_SYMBOL(sock_common_getsockopt);
1495
1496 #ifdef CONFIG_COMPAT
1497 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1498                                   char __user *optval, int __user *optlen)
1499 {
1500         struct sock *sk = sock->sk;
1501
1502         if (sk->sk_prot->compat_setsockopt != NULL)
1503                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1504                                                       optval, optlen);
1505         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1506 }
1507 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1508 #endif
1509
1510 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1511                         struct msghdr *msg, size_t size, int flags)
1512 {
1513         struct sock *sk = sock->sk;
1514         int addr_len = 0;
1515         int err;
1516
1517         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1518                                    flags & ~MSG_DONTWAIT, &addr_len);
1519         if (err >= 0)
1520                 msg->msg_namelen = addr_len;
1521         return err;
1522 }
1523
1524 EXPORT_SYMBOL(sock_common_recvmsg);
1525
1526 /*
1527  *      Set socket options on an inet socket.
1528  */
1529 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1530                            char __user *optval, int optlen)
1531 {
1532         struct sock *sk = sock->sk;
1533
1534         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1535 }
1536
1537 EXPORT_SYMBOL(sock_common_setsockopt);
1538
1539 #ifdef CONFIG_COMPAT
1540 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1541                                   char __user *optval, int optlen)
1542 {
1543         struct sock *sk = sock->sk;
1544
1545         if (sk->sk_prot->compat_setsockopt != NULL)
1546                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1547                                                       optval, optlen);
1548         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1549 }
1550 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1551 #endif
1552
1553 void sk_common_release(struct sock *sk)
1554 {
1555         if (sk->sk_prot->destroy)
1556                 sk->sk_prot->destroy(sk);
1557
1558         /*
1559          * Observation: when sock_common_release is called, processes have
1560          * no access to socket. But net still has.
1561          * Step one, detach it from networking:
1562          *
1563          * A. Remove from hash tables.
1564          */
1565
1566         sk->sk_prot->unhash(sk);
1567
1568         /*
1569          * In this point socket cannot receive new packets, but it is possible
1570          * that some packets are in flight because some CPU runs receiver and
1571          * did hash table lookup before we unhashed socket. They will achieve
1572          * receive queue and will be purged by socket destructor.
1573          *
1574          * Also we still have packets pending on receive queue and probably,
1575          * our own packets waiting in device queues. sock_destroy will drain
1576          * receive queue, but transmitted packets will delay socket destruction
1577          * until the last reference will be released.
1578          */
1579
1580         sock_orphan(sk);
1581
1582         xfrm_sk_free_policy(sk);
1583
1584         sk_refcnt_debug_release(sk);
1585         sock_put(sk);
1586 }
1587
1588 EXPORT_SYMBOL(sk_common_release);
1589
1590 static DEFINE_RWLOCK(proto_list_lock);
1591 static LIST_HEAD(proto_list);
1592
1593 int proto_register(struct proto *prot, int alloc_slab)
1594 {
1595         char *request_sock_slab_name = NULL;
1596         char *timewait_sock_slab_name;
1597         int rc = -ENOBUFS;
1598
1599         if (alloc_slab) {
1600                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1601                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1602
1603                 if (prot->slab == NULL) {
1604                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1605                                prot->name);
1606                         goto out;
1607                 }
1608
1609                 if (prot->rsk_prot != NULL) {
1610                         static const char mask[] = "request_sock_%s";
1611
1612                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1613                         if (request_sock_slab_name == NULL)
1614                                 goto out_free_sock_slab;
1615
1616                         sprintf(request_sock_slab_name, mask, prot->name);
1617                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1618                                                                  prot->rsk_prot->obj_size, 0,
1619                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1620
1621                         if (prot->rsk_prot->slab == NULL) {
1622                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1623                                        prot->name);
1624                                 goto out_free_request_sock_slab_name;
1625                         }
1626                 }
1627
1628                 if (prot->twsk_prot != NULL) {
1629                         static const char mask[] = "tw_sock_%s";
1630
1631                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1632
1633                         if (timewait_sock_slab_name == NULL)
1634                                 goto out_free_request_sock_slab;
1635
1636                         sprintf(timewait_sock_slab_name, mask, prot->name);
1637                         prot->twsk_prot->twsk_slab =
1638                                 kmem_cache_create(timewait_sock_slab_name,
1639                                                   prot->twsk_prot->twsk_obj_size,
1640                                                   0, SLAB_HWCACHE_ALIGN,
1641                                                   NULL, NULL);
1642                         if (prot->twsk_prot->twsk_slab == NULL)
1643                                 goto out_free_timewait_sock_slab_name;
1644                 }
1645         }
1646
1647         write_lock(&proto_list_lock);
1648         list_add(&prot->node, &proto_list);
1649         write_unlock(&proto_list_lock);
1650         rc = 0;
1651 out:
1652         return rc;
1653 out_free_timewait_sock_slab_name:
1654         kfree(timewait_sock_slab_name);
1655 out_free_request_sock_slab:
1656         if (prot->rsk_prot && prot->rsk_prot->slab) {
1657                 kmem_cache_destroy(prot->rsk_prot->slab);
1658                 prot->rsk_prot->slab = NULL;
1659         }
1660 out_free_request_sock_slab_name:
1661         kfree(request_sock_slab_name);
1662 out_free_sock_slab:
1663         kmem_cache_destroy(prot->slab);
1664         prot->slab = NULL;
1665         goto out;
1666 }
1667
1668 EXPORT_SYMBOL(proto_register);
1669
1670 void proto_unregister(struct proto *prot)
1671 {
1672         write_lock(&proto_list_lock);
1673         list_del(&prot->node);
1674         write_unlock(&proto_list_lock);
1675
1676         if (prot->slab != NULL) {
1677                 kmem_cache_destroy(prot->slab);
1678                 prot->slab = NULL;
1679         }
1680
1681         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1682                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1683
1684                 kmem_cache_destroy(prot->rsk_prot->slab);
1685                 kfree(name);
1686                 prot->rsk_prot->slab = NULL;
1687         }
1688
1689         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1690                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1691
1692                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1693                 kfree(name);
1694                 prot->twsk_prot->twsk_slab = NULL;
1695         }
1696 }
1697
1698 EXPORT_SYMBOL(proto_unregister);
1699
1700 #ifdef CONFIG_PROC_FS
1701 static inline struct proto *__proto_head(void)
1702 {
1703         return list_entry(proto_list.next, struct proto, node);
1704 }
1705
1706 static inline struct proto *proto_head(void)
1707 {
1708         return list_empty(&proto_list) ? NULL : __proto_head();
1709 }
1710
1711 static inline struct proto *proto_next(struct proto *proto)
1712 {
1713         return proto->node.next == &proto_list ? NULL :
1714                 list_entry(proto->node.next, struct proto, node);
1715 }
1716
1717 static inline struct proto *proto_get_idx(loff_t pos)
1718 {
1719         struct proto *proto;
1720         loff_t i = 0;
1721
1722         list_for_each_entry(proto, &proto_list, node)
1723                 if (i++ == pos)
1724                         goto out;
1725
1726         proto = NULL;
1727 out:
1728         return proto;
1729 }
1730
1731 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1732 {
1733         read_lock(&proto_list_lock);
1734         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1735 }
1736
1737 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1738 {
1739         ++*pos;
1740         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1741 }
1742
1743 static void proto_seq_stop(struct seq_file *seq, void *v)
1744 {
1745         read_unlock(&proto_list_lock);
1746 }
1747
/*
 * Map a method pointer to the flag character shown in the protocol
 * listing: 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
1752
1753 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1754 {
1755         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1756                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1757                    proto->name,
1758                    proto->obj_size,
1759                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1760                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1761                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1762                    proto->max_header,
1763                    proto->slab == NULL ? "no" : "yes",
1764                    module_name(proto->owner),
1765                    proto_method_implemented(proto->close),
1766                    proto_method_implemented(proto->connect),
1767                    proto_method_implemented(proto->disconnect),
1768                    proto_method_implemented(proto->accept),
1769                    proto_method_implemented(proto->ioctl),
1770                    proto_method_implemented(proto->init),
1771                    proto_method_implemented(proto->destroy),
1772                    proto_method_implemented(proto->shutdown),
1773                    proto_method_implemented(proto->setsockopt),
1774                    proto_method_implemented(proto->getsockopt),
1775                    proto_method_implemented(proto->sendmsg),
1776                    proto_method_implemented(proto->recvmsg),
1777                    proto_method_implemented(proto->sendpage),
1778                    proto_method_implemented(proto->bind),
1779                    proto_method_implemented(proto->backlog_rcv),
1780                    proto_method_implemented(proto->hash),
1781                    proto_method_implemented(proto->unhash),
1782                    proto_method_implemented(proto->get_port),
1783                    proto_method_implemented(proto->enter_memory_pressure));
1784 }
1785
1786 static int proto_seq_show(struct seq_file *seq, void *v)
1787 {
1788         if (v == SEQ_START_TOKEN)
1789                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1790                            "protocol",
1791                            "size",
1792                            "sockets",
1793                            "memory",
1794                            "press",
1795                            "maxhdr",
1796                            "slab",
1797                            "module",
1798                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1799         else
1800                 proto_seq_printf(seq, v);
1801         return 0;
1802 }
1803
1804 static struct seq_operations proto_seq_ops = {
1805         .start  = proto_seq_start,
1806         .next   = proto_seq_next,
1807         .stop   = proto_seq_stop,
1808         .show   = proto_seq_show,
1809 };
1810
1811 static int proto_seq_open(struct inode *inode, struct file *file)
1812 {
1813         return seq_open(file, &proto_seq_ops);
1814 }
1815
1816 static struct file_operations proto_seq_fops = {
1817         .owner          = THIS_MODULE,
1818         .open           = proto_seq_open,
1819         .read           = seq_read,
1820         .llseek         = seq_lseek,
1821         .release        = seq_release,
1822 };
1823
1824 static int __init proto_init(void)
1825 {
1826         /* register /proc/net/protocols */
1827         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1828 }
1829
1830 subsys_initcall(proto_init);
1831
1832 #endif /* PROC_FS */
1833
/* Symbols exported from this file, listed alphabetically. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sysctl_optmem_max);
/* These two are exported only when CONFIG_SYSCTL is set. */
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif