Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6] / net / core / sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *              Alan Cox        :       Numerous verify_area() problems
17  *              Alan Cox        :       Connecting on a connecting socket
18  *                                      now returns an error for tcp.
19  *              Alan Cox        :       sock->protocol is set correctly.
20  *                                      and is not sometimes left as 0.
21  *              Alan Cox        :       connect handles icmp errors on a
22  *                                      connect properly. Unfortunately there
23  *                                      is a restart syscall nasty there. I
24  *                                      can't match BSD without hacking the C
25  *                                      library. Ideas urgently sought!
26  *              Alan Cox        :       Disallow bind() to addresses that are
27  *                                      not ours - especially broadcast ones!!
28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
30  *                                      instead they leave that for the DESTROY timer.
31  *              Alan Cox        :       Clean up error flag in accept
32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
33  *                                      was buggy. Put a remove_sock() in the handler
34  *                                      for memory when we hit 0. Also altered the timer
35  *                                      code. The ACK stuff can wait and needs major
36  *                                      TCP layer surgery.
37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
38  *                                      and fixed timer/inet_bh race.
39  *              Alan Cox        :       Added zapped flag for TCP
40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
47  *      Pauline Middelink       :       identd support
48  *              Alan Cox        :       Fixed connect() taking signals I think.
49  *              Alan Cox        :       SO_LINGER supported
50  *              Alan Cox        :       Error reporting fixes
51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
52  *              Alan Cox        :       inet sockets don't set sk->type!
53  *              Alan Cox        :       Split socket option code
54  *              Alan Cox        :       Callbacks
55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
56  *              Alex            :       Removed restriction on inet fioctl
57  *              Alan Cox        :       Splitting INET from NET core
58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
60  *              Alan Cox        :       Split IP from generic code
61  *              Alan Cox        :       New kfree_skbmem()
62  *              Alan Cox        :       Make SO_DEBUG superuser only.
63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
64  *                                      (compatibility fix)
65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
66  *              Alan Cox        :       Allocator for a socket is settable.
67  *              Alan Cox        :       SO_ERROR includes soft errors.
68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
69  *              Alan Cox        :       Generic socket allocation to make hooks
70  *                                      easier (suggested by Craig Metz).
71  *              Michael Pall    :       SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
79  *              Andi Kleen      :       Fix write_space callback
80  *              Chris Evans     :       Security fixes - signedness again
81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *              This program is free software; you can redistribute it and/or
87  *              modify it under the terms of the GNU General Public License
88  *              as published by the Free Software Foundation; either version
89  *              2 of the License, or (at your option) any later version.
90  */
91
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <net/xfrm.h>
124 #include <linux/ipsec.h>
125
126 #include <linux/filter.h>
127
128 #ifdef CONFIG_INET
129 #include <net/tcp.h>
130 #endif
131
132 /*
133  * Each address family might have different locking rules, so we have
134  * one sk_lock key and one slock key per address family (see sock_lock_init()):
135  */
136 static struct lock_class_key af_family_keys[AF_MAX];
137 static struct lock_class_key af_family_slock_keys[AF_MAX];
138
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * Each table is indexed by address family number; the three tables are
 * kept in the same order and must stay parallel to each other.
 */

/* Names for the per-family sk_lock class (used by sock_lock_init()). */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_MAX"
};

/* Names for the per-family sk_lock.slock class (used by sock_lock_init()). */
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_MAX"
};

/*
 * "clock-" names; not referenced in this part of the file.
 * NOTE(review): presumably the names for the sk_callback_lock classes
 * keyed by af_callback_keys[] - confirm against the user of this table.
 */
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_MAX"
};
189
190 /*
191  * sk_callback_lock locking rules are per-address-family,
192  * so split the lock classes by using a per-AF key (AF_MAX entries):
193  */
194 static struct lock_class_key af_callback_keys[AF_MAX];
195
196 /* Take into consideration the size of the struct sk_buff overhead in the
197  * determination of these values, since that is non-constant across
198  * platforms.  This makes socket queueing behavior and performance
199  * not depend upon such differences.  Each limit below works out to
200  * _SK_MEM_PACKETS buffers of (sizeof(struct sk_buff) + 256) bytes. */
201 #define _SK_MEM_PACKETS         256
202 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
203 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205
206 /* Run time adjustable parameters.  sock_setsockopt() caps SO_SNDBUF and SO_RCVBUF requests at the two *_max values. */
207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
211
212 /* Maximal space eaten by iovec or ancillary data plus some space */
213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
214
215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
216 {
217         struct timeval tv;
218
219         if (optlen < sizeof(tv))
220                 return -EINVAL;
221         if (copy_from_user(&tv, optval, sizeof(tv)))
222                 return -EFAULT;
223         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
224                 return -EDOM;
225
226         if (tv.tv_sec < 0) {
227                 static int warned __read_mostly;
228
229                 *timeo_p = 0;
230                 if (warned < 10 && net_ratelimit()) {
231                         warned++;
232                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
233                                "tries to set negative timeout\n",
234                                 current->comm, task_pid_nr(current));
235                 }
236                 return 0;
237         }
238         *timeo_p = MAX_SCHEDULE_TIMEOUT;
239         if (tv.tv_sec == 0 && tv.tv_usec == 0)
240                 return 0;
241         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243         return 0;
244 }
245
246 static void sock_warn_obsolete_bsdism(const char *name)
247 {
248         static int warned;
249         static char warncomm[TASK_COMM_LEN];
250         if (strcmp(warncomm, current->comm) && warned < 5) {
251                 strcpy(warncomm,  current->comm);
252                 printk(KERN_WARNING "process `%s' is using obsolete "
253                        "%s SO_BSDCOMPAT\n", warncomm, name);
254                 warned++;
255         }
256 }
257
258 static void sock_disable_timestamp(struct sock *sk)
259 {
260         if (sock_flag(sk, SOCK_TIMESTAMP)) {
261                 sock_reset_flag(sk, SOCK_TIMESTAMP);
262                 net_disable_timestamp();
263         }
264 }
265
266
267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 {
269         int err = 0;
270         int skb_len;
271
272         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
273            number of warnings when compiling with -W --ANK
274          */
275         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276             (unsigned)sk->sk_rcvbuf) {
277                 err = -ENOMEM;
278                 goto out;
279         }
280
281         err = sk_filter(sk, skb);
282         if (err)
283                 goto out;
284
285         if (!sk_rmem_schedule(sk, skb->truesize)) {
286                 err = -ENOBUFS;
287                 goto out;
288         }
289
290         skb->dev = NULL;
291         skb_set_owner_r(skb, sk);
292         /*
293          * release dst right now while its hot
294          */
295         dst_release(skb->dst);
296         skb->dst = NULL;
297         /* Cache the SKB length before we tack it onto the receive
298          * queue.  Once it is added it no longer belongs to us and
299          * may be freed by other threads of control pulling packets
300          * from the queue.
301          */
302         skb_len = skb->len;
303
304         skb_queue_tail(&sk->sk_receive_queue, skb);
305
306         if (!sock_flag(sk, SOCK_DEAD))
307                 sk->sk_data_ready(sk, skb_len);
308 out:
309         return err;
310 }
311 EXPORT_SYMBOL(sock_queue_rcv_skb);
312
313 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
314 {
315         int rc = NET_RX_SUCCESS;
316
317         if (sk_filter(sk, skb))
318                 goto discard_and_relse;
319
320         skb->dev = NULL;
321
322         if (nested)
323                 bh_lock_sock_nested(sk);
324         else
325                 bh_lock_sock(sk);
326         if (!sock_owned_by_user(sk)) {
327                 /*
328                  * trylock + unlock semantics:
329                  */
330                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
331
332                 rc = sk_backlog_rcv(sk, skb);
333
334                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
335         } else
336                 sk_add_backlog(sk, skb);
337         bh_unlock_sock(sk);
338 out:
339         sock_put(sk);
340         return rc;
341 discard_and_relse:
342         kfree_skb(skb);
343         goto out;
344 }
345 EXPORT_SYMBOL(sk_receive_skb);
346
347 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
348 {
349         struct dst_entry *dst = sk->sk_dst_cache;
350
351         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
352                 sk->sk_dst_cache = NULL;
353                 dst_release(dst);
354                 return NULL;
355         }
356
357         return dst;
358 }
359 EXPORT_SYMBOL(__sk_dst_check);
360
361 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
362 {
363         struct dst_entry *dst = sk_dst_get(sk);
364
365         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
366                 sk_dst_reset(sk);
367                 dst_release(dst);
368                 return NULL;
369         }
370
371         return dst;
372 }
373 EXPORT_SYMBOL(sk_dst_check);
374
375 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
376 {
377         int ret = -ENOPROTOOPT;
378 #ifdef CONFIG_NETDEVICES
379         struct net *net = sock_net(sk);
380         char devname[IFNAMSIZ];
381         int index;
382
383         /* Sorry... */
384         ret = -EPERM;
385         if (!capable(CAP_NET_RAW))
386                 goto out;
387
388         ret = -EINVAL;
389         if (optlen < 0)
390                 goto out;
391
392         /* Bind this socket to a particular device like "eth0",
393          * as specified in the passed interface name. If the
394          * name is "" or the option length is zero the socket
395          * is not bound.
396          */
397         if (optlen > IFNAMSIZ - 1)
398                 optlen = IFNAMSIZ - 1;
399         memset(devname, 0, sizeof(devname));
400
401         ret = -EFAULT;
402         if (copy_from_user(devname, optval, optlen))
403                 goto out;
404
405         if (devname[0] == '\0') {
406                 index = 0;
407         } else {
408                 struct net_device *dev = dev_get_by_name(net, devname);
409
410                 ret = -ENODEV;
411                 if (!dev)
412                         goto out;
413
414                 index = dev->ifindex;
415                 dev_put(dev);
416         }
417
418         lock_sock(sk);
419         sk->sk_bound_dev_if = index;
420         sk_dst_reset(sk);
421         release_sock(sk);
422
423         ret = 0;
424
425 out:
426 #endif
427
428         return ret;
429 }
430
/* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
438
439 /*
440  *      This is meant for all protocols to use and covers goings on
441  *      at the socket level. Everything here is generic.
442  */
443
444 int sock_setsockopt(struct socket *sock, int level, int optname,
445                     char __user *optval, int optlen)
446 {
447         struct sock *sk=sock->sk;
448         int val;
449         int valbool;
450         struct linger ling;
451         int ret = 0;
452
453         /*
454          *      Options without arguments
455          */
456
457         if (optname == SO_BINDTODEVICE)
458                 return sock_bindtodevice(sk, optval, optlen);
459
460         if (optlen < sizeof(int))
461                 return -EINVAL;
462
463         if (get_user(val, (int __user *)optval))
464                 return -EFAULT;
465
466         valbool = val?1:0;
467
468         lock_sock(sk);
469
470         switch(optname) {
471         case SO_DEBUG:
472                 if (val && !capable(CAP_NET_ADMIN)) {
473                         ret = -EACCES;
474                 } else
475                         sock_valbool_flag(sk, SOCK_DBG, valbool);
476                 break;
477         case SO_REUSEADDR:
478                 sk->sk_reuse = valbool;
479                 break;
480         case SO_TYPE:
481         case SO_ERROR:
482                 ret = -ENOPROTOOPT;
483                 break;
484         case SO_DONTROUTE:
485                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
486                 break;
487         case SO_BROADCAST:
488                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
489                 break;
490         case SO_SNDBUF:
491                 /* Don't error on this BSD doesn't and if you think
492                    about it this is right. Otherwise apps have to
493                    play 'guess the biggest size' games. RCVBUF/SNDBUF
494                    are treated in BSD as hints */
495
496                 if (val > sysctl_wmem_max)
497                         val = sysctl_wmem_max;
498 set_sndbuf:
499                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
500                 if ((val * 2) < SOCK_MIN_SNDBUF)
501                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
502                 else
503                         sk->sk_sndbuf = val * 2;
504
505                 /*
506                  *      Wake up sending tasks if we
507                  *      upped the value.
508                  */
509                 sk->sk_write_space(sk);
510                 break;
511
512         case SO_SNDBUFFORCE:
513                 if (!capable(CAP_NET_ADMIN)) {
514                         ret = -EPERM;
515                         break;
516                 }
517                 goto set_sndbuf;
518
519         case SO_RCVBUF:
520                 /* Don't error on this BSD doesn't and if you think
521                    about it this is right. Otherwise apps have to
522                    play 'guess the biggest size' games. RCVBUF/SNDBUF
523                    are treated in BSD as hints */
524
525                 if (val > sysctl_rmem_max)
526                         val = sysctl_rmem_max;
527 set_rcvbuf:
528                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
529                 /*
530                  * We double it on the way in to account for
531                  * "struct sk_buff" etc. overhead.   Applications
532                  * assume that the SO_RCVBUF setting they make will
533                  * allow that much actual data to be received on that
534                  * socket.
535                  *
536                  * Applications are unaware that "struct sk_buff" and
537                  * other overheads allocate from the receive buffer
538                  * during socket buffer allocation.
539                  *
540                  * And after considering the possible alternatives,
541                  * returning the value we actually used in getsockopt
542                  * is the most desirable behavior.
543                  */
544                 if ((val * 2) < SOCK_MIN_RCVBUF)
545                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
546                 else
547                         sk->sk_rcvbuf = val * 2;
548                 break;
549
550         case SO_RCVBUFFORCE:
551                 if (!capable(CAP_NET_ADMIN)) {
552                         ret = -EPERM;
553                         break;
554                 }
555                 goto set_rcvbuf;
556
557         case SO_KEEPALIVE:
558 #ifdef CONFIG_INET
559                 if (sk->sk_protocol == IPPROTO_TCP)
560                         tcp_set_keepalive(sk, valbool);
561 #endif
562                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
563                 break;
564
565         case SO_OOBINLINE:
566                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
567                 break;
568
569         case SO_NO_CHECK:
570                 sk->sk_no_check = valbool;
571                 break;
572
573         case SO_PRIORITY:
574                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
575                         sk->sk_priority = val;
576                 else
577                         ret = -EPERM;
578                 break;
579
580         case SO_LINGER:
581                 if (optlen < sizeof(ling)) {
582                         ret = -EINVAL;  /* 1003.1g */
583                         break;
584                 }
585                 if (copy_from_user(&ling,optval,sizeof(ling))) {
586                         ret = -EFAULT;
587                         break;
588                 }
589                 if (!ling.l_onoff)
590                         sock_reset_flag(sk, SOCK_LINGER);
591                 else {
592 #if (BITS_PER_LONG == 32)
593                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
594                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
595                         else
596 #endif
597                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
598                         sock_set_flag(sk, SOCK_LINGER);
599                 }
600                 break;
601
602         case SO_BSDCOMPAT:
603                 sock_warn_obsolete_bsdism("setsockopt");
604                 break;
605
606         case SO_PASSCRED:
607                 if (valbool)
608                         set_bit(SOCK_PASSCRED, &sock->flags);
609                 else
610                         clear_bit(SOCK_PASSCRED, &sock->flags);
611                 break;
612
613         case SO_TIMESTAMP:
614         case SO_TIMESTAMPNS:
615                 if (valbool)  {
616                         if (optname == SO_TIMESTAMP)
617                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
618                         else
619                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
620                         sock_set_flag(sk, SOCK_RCVTSTAMP);
621                         sock_enable_timestamp(sk);
622                 } else {
623                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
624                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
625                 }
626                 break;
627
628         case SO_RCVLOWAT:
629                 if (val < 0)
630                         val = INT_MAX;
631                 sk->sk_rcvlowat = val ? : 1;
632                 break;
633
634         case SO_RCVTIMEO:
635                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
636                 break;
637
638         case SO_SNDTIMEO:
639                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
640                 break;
641
642         case SO_ATTACH_FILTER:
643                 ret = -EINVAL;
644                 if (optlen == sizeof(struct sock_fprog)) {
645                         struct sock_fprog fprog;
646
647                         ret = -EFAULT;
648                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
649                                 break;
650
651                         ret = sk_attach_filter(&fprog, sk);
652                 }
653                 break;
654
655         case SO_DETACH_FILTER:
656                 ret = sk_detach_filter(sk);
657                 break;
658
659         case SO_PASSSEC:
660                 if (valbool)
661                         set_bit(SOCK_PASSSEC, &sock->flags);
662                 else
663                         clear_bit(SOCK_PASSSEC, &sock->flags);
664                 break;
665         case SO_MARK:
666                 if (!capable(CAP_NET_ADMIN))
667                         ret = -EPERM;
668                 else {
669                         sk->sk_mark = val;
670                 }
671                 break;
672
673                 /* We implement the SO_SNDLOWAT etc to
674                    not be settable (1003.1g 5.3) */
675         default:
676                 ret = -ENOPROTOOPT;
677                 break;
678         }
679         release_sock(sk);
680         return ret;
681 }
682
683
684 int sock_getsockopt(struct socket *sock, int level, int optname,
685                     char __user *optval, int __user *optlen)
686 {
687         struct sock *sk = sock->sk;
688
689         union {
690                 int val;
691                 struct linger ling;
692                 struct timeval tm;
693         } v;
694
695         unsigned int lv = sizeof(int);
696         int len;
697
698         if (get_user(len, optlen))
699                 return -EFAULT;
700         if (len < 0)
701                 return -EINVAL;
702
703         switch(optname) {
704         case SO_DEBUG:
705                 v.val = sock_flag(sk, SOCK_DBG);
706                 break;
707
708         case SO_DONTROUTE:
709                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
710                 break;
711
712         case SO_BROADCAST:
713                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
714                 break;
715
716         case SO_SNDBUF:
717                 v.val = sk->sk_sndbuf;
718                 break;
719
720         case SO_RCVBUF:
721                 v.val = sk->sk_rcvbuf;
722                 break;
723
724         case SO_REUSEADDR:
725                 v.val = sk->sk_reuse;
726                 break;
727
728         case SO_KEEPALIVE:
729                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
730                 break;
731
732         case SO_TYPE:
733                 v.val = sk->sk_type;
734                 break;
735
736         case SO_ERROR:
737                 v.val = -sock_error(sk);
738                 if (v.val==0)
739                         v.val = xchg(&sk->sk_err_soft, 0);
740                 break;
741
742         case SO_OOBINLINE:
743                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
744                 break;
745
746         case SO_NO_CHECK:
747                 v.val = sk->sk_no_check;
748                 break;
749
750         case SO_PRIORITY:
751                 v.val = sk->sk_priority;
752                 break;
753
754         case SO_LINGER:
755                 lv              = sizeof(v.ling);
756                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
757                 v.ling.l_linger = sk->sk_lingertime / HZ;
758                 break;
759
760         case SO_BSDCOMPAT:
761                 sock_warn_obsolete_bsdism("getsockopt");
762                 break;
763
764         case SO_TIMESTAMP:
765                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
766                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
767                 break;
768
769         case SO_TIMESTAMPNS:
770                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
771                 break;
772
773         case SO_RCVTIMEO:
774                 lv=sizeof(struct timeval);
775                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
776                         v.tm.tv_sec = 0;
777                         v.tm.tv_usec = 0;
778                 } else {
779                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
780                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
781                 }
782                 break;
783
784         case SO_SNDTIMEO:
785                 lv=sizeof(struct timeval);
786                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
787                         v.tm.tv_sec = 0;
788                         v.tm.tv_usec = 0;
789                 } else {
790                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
791                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
792                 }
793                 break;
794
795         case SO_RCVLOWAT:
796                 v.val = sk->sk_rcvlowat;
797                 break;
798
799         case SO_SNDLOWAT:
800                 v.val=1;
801                 break;
802
803         case SO_PASSCRED:
804                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
805                 break;
806
807         case SO_PEERCRED:
808                 if (len > sizeof(sk->sk_peercred))
809                         len = sizeof(sk->sk_peercred);
810                 if (copy_to_user(optval, &sk->sk_peercred, len))
811                         return -EFAULT;
812                 goto lenout;
813
814         case SO_PEERNAME:
815         {
816                 char address[128];
817
818                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
819                         return -ENOTCONN;
820                 if (lv < len)
821                         return -EINVAL;
822                 if (copy_to_user(optval, address, len))
823                         return -EFAULT;
824                 goto lenout;
825         }
826
827         /* Dubious BSD thing... Probably nobody even uses it, but
828          * the UNIX standard wants it for whatever reason... -DaveM
829          */
830         case SO_ACCEPTCONN:
831                 v.val = sk->sk_state == TCP_LISTEN;
832                 break;
833
834         case SO_PASSSEC:
835                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
836                 break;
837
838         case SO_PEERSEC:
839                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
840
841         case SO_MARK:
842                 v.val = sk->sk_mark;
843                 break;
844
845         default:
846                 return -ENOPROTOOPT;
847         }
848
849         if (len > lv)
850                 len = lv;
851         if (copy_to_user(optval, &v, len))
852                 return -EFAULT;
853 lenout:
854         if (put_user(len, optlen))
855                 return -EFAULT;
856         return 0;
857 }
858
859 /*
860  * Initialize an sk_lock.
861  *
862  * (We also register the sk_lock with the lock validator.)
863  */
864 static inline void sock_lock_init(struct sock *sk)
865 {
866         sock_lock_init_class_and_name(sk,
867                         af_family_slock_key_strings[sk->sk_family],
868                         af_family_slock_keys + sk->sk_family,
869                         af_family_key_strings[sk->sk_family],
870                         af_family_keys + sk->sk_family);
871 }
872
873 static void sock_copy(struct sock *nsk, const struct sock *osk)
874 {
875 #ifdef CONFIG_SECURITY_NETWORK
876         void *sptr = nsk->sk_security;
877 #endif
878
879         memcpy(nsk, osk, osk->sk_prot->obj_size);
880 #ifdef CONFIG_SECURITY_NETWORK
881         nsk->sk_security = sptr;
882         security_sk_clone(osk, nsk);
883 #endif
884 }
885
886 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
887                 int family)
888 {
889         struct sock *sk;
890         struct kmem_cache *slab;
891
892         slab = prot->slab;
893         if (slab != NULL)
894                 sk = kmem_cache_alloc(slab, priority);
895         else
896                 sk = kmalloc(prot->obj_size, priority);
897
898         if (sk != NULL) {
899                 if (security_sk_alloc(sk, family, priority))
900                         goto out_free;
901
902                 if (!try_module_get(prot->owner))
903                         goto out_free_sec;
904         }
905
906         return sk;
907
908 out_free_sec:
909         security_sk_free(sk);
910 out_free:
911         if (slab != NULL)
912                 kmem_cache_free(slab, sk);
913         else
914                 kfree(sk);
915         return NULL;
916 }
917
918 static void sk_prot_free(struct proto *prot, struct sock *sk)
919 {
920         struct kmem_cache *slab;
921         struct module *owner;
922
923         owner = prot->owner;
924         slab = prot->slab;
925
926         security_sk_free(sk);
927         if (slab != NULL)
928                 kmem_cache_free(slab, sk);
929         else
930                 kfree(sk);
931         module_put(owner);
932 }
933
934 /**
935  *      sk_alloc - All socket objects are allocated here
936  *      @net: the applicable net namespace
937  *      @family: protocol family
938  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
939  *      @prot: struct proto associated with this new sock instance
940  */
941 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
942                       struct proto *prot)
943 {
944         struct sock *sk;
945
946         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
947         if (sk) {
948                 sk->sk_family = family;
949                 /*
950                  * See comment in struct sock definition to understand
951                  * why we need sk_prot_creator -acme
952                  */
953                 sk->sk_prot = sk->sk_prot_creator = prot;
954                 sock_lock_init(sk);
955                 sock_net_set(sk, get_net(net));
956         }
957
958         return sk;
959 }
960
961 void sk_free(struct sock *sk)
962 {
963         struct sk_filter *filter;
964
965         if (sk->sk_destruct)
966                 sk->sk_destruct(sk);
967
968         filter = rcu_dereference(sk->sk_filter);
969         if (filter) {
970                 sk_filter_uncharge(sk, filter);
971                 rcu_assign_pointer(sk->sk_filter, NULL);
972         }
973
974         sock_disable_timestamp(sk);
975
976         if (atomic_read(&sk->sk_omem_alloc))
977                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
978                        __func__, atomic_read(&sk->sk_omem_alloc));
979
980         put_net(sock_net(sk));
981         sk_prot_free(sk->sk_prot_creator, sk);
982 }
983
984 /*
985  * Last sock_put should drop referrence to sk->sk_net. It has already
986  * been dropped in sk_change_net. Taking referrence to stopping namespace
987  * is not an option.
988  * Take referrence to a socket to remove it from hash _alive_ and after that
989  * destroy it in the context of init_net.
990  */
991 void sk_release_kernel(struct sock *sk)
992 {
993         if (sk == NULL || sk->sk_socket == NULL)
994                 return;
995
996         sock_hold(sk);
997         sock_release(sk->sk_socket);
998         release_net(sock_net(sk));
999         sock_net_set(sk, get_net(&init_net));
1000         sock_put(sk);
1001 }
1002 EXPORT_SYMBOL(sk_release_kernel);
1003
1004 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1005 {
1006         struct sock *newsk;
1007
1008         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1009         if (newsk != NULL) {
1010                 struct sk_filter *filter;
1011
1012                 sock_copy(newsk, sk);
1013
1014                 /* SANITY */
1015                 get_net(sock_net(newsk));
1016                 sk_node_init(&newsk->sk_node);
1017                 sock_lock_init(newsk);
1018                 bh_lock_sock(newsk);
1019                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1020
1021                 atomic_set(&newsk->sk_rmem_alloc, 0);
1022                 atomic_set(&newsk->sk_wmem_alloc, 0);
1023                 atomic_set(&newsk->sk_omem_alloc, 0);
1024                 skb_queue_head_init(&newsk->sk_receive_queue);
1025                 skb_queue_head_init(&newsk->sk_write_queue);
1026 #ifdef CONFIG_NET_DMA
1027                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1028 #endif
1029
1030                 rwlock_init(&newsk->sk_dst_lock);
1031                 rwlock_init(&newsk->sk_callback_lock);
1032                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1033                                 af_callback_keys + newsk->sk_family,
1034                                 af_family_clock_key_strings[newsk->sk_family]);
1035
1036                 newsk->sk_dst_cache     = NULL;
1037                 newsk->sk_wmem_queued   = 0;
1038                 newsk->sk_forward_alloc = 0;
1039                 newsk->sk_send_head     = NULL;
1040                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1041
1042                 sock_reset_flag(newsk, SOCK_DONE);
1043                 skb_queue_head_init(&newsk->sk_error_queue);
1044
1045                 filter = newsk->sk_filter;
1046                 if (filter != NULL)
1047                         sk_filter_charge(newsk, filter);
1048
1049                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1050                         /* It is still raw copy of parent, so invalidate
1051                          * destructor and make plain sk_free() */
1052                         newsk->sk_destruct = NULL;
1053                         sk_free(newsk);
1054                         newsk = NULL;
1055                         goto out;
1056                 }
1057
1058                 newsk->sk_err      = 0;
1059                 newsk->sk_priority = 0;
1060                 atomic_set(&newsk->sk_refcnt, 2);
1061
1062                 /*
1063                  * Increment the counter in the same struct proto as the master
1064                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1065                  * is the same as sk->sk_prot->socks, as this field was copied
1066                  * with memcpy).
1067                  *
1068                  * This _changes_ the previous behaviour, where
1069                  * tcp_create_openreq_child always was incrementing the
1070                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1071                  * to be taken into account in all callers. -acme
1072                  */
1073                 sk_refcnt_debug_inc(newsk);
1074                 sk_set_socket(newsk, NULL);
1075                 newsk->sk_sleep  = NULL;
1076
1077                 if (newsk->sk_prot->sockets_allocated)
1078                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1079         }
1080 out:
1081         return newsk;
1082 }
1083
1084 EXPORT_SYMBOL_GPL(sk_clone);
1085
1086 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1087 {
1088         __sk_dst_set(sk, dst);
1089         sk->sk_route_caps = dst->dev->features;
1090         if (sk->sk_route_caps & NETIF_F_GSO)
1091                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1092         if (sk_can_gso(sk)) {
1093                 if (dst->header_len) {
1094                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1095                 } else {
1096                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1097                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1098                 }
1099         }
1100 }
1101 EXPORT_SYMBOL_GPL(sk_setup_caps);
1102
1103 void __init sk_init(void)
1104 {
1105         if (num_physpages <= 4096) {
1106                 sysctl_wmem_max = 32767;
1107                 sysctl_rmem_max = 32767;
1108                 sysctl_wmem_default = 32767;
1109                 sysctl_rmem_default = 32767;
1110         } else if (num_physpages >= 131072) {
1111                 sysctl_wmem_max = 131071;
1112                 sysctl_rmem_max = 131071;
1113         }
1114 }
1115
1116 /*
1117  *      Simple resource managers for sockets.
1118  */
1119
1120
1121 /*
1122  * Write buffer destructor automatically called from kfree_skb.
1123  */
1124 void sock_wfree(struct sk_buff *skb)
1125 {
1126         struct sock *sk = skb->sk;
1127
1128         /* In case it might be waiting for more memory. */
1129         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1130         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1131                 sk->sk_write_space(sk);
1132         sock_put(sk);
1133 }
1134
1135 /*
1136  * Read buffer destructor automatically called from kfree_skb.
1137  */
1138 void sock_rfree(struct sk_buff *skb)
1139 {
1140         struct sock *sk = skb->sk;
1141
1142         skb_truesize_check(skb);
1143         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1144         sk_mem_uncharge(skb->sk, skb->truesize);
1145 }
1146
1147
1148 int sock_i_uid(struct sock *sk)
1149 {
1150         int uid;
1151
1152         read_lock(&sk->sk_callback_lock);
1153         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1154         read_unlock(&sk->sk_callback_lock);
1155         return uid;
1156 }
1157
1158 unsigned long sock_i_ino(struct sock *sk)
1159 {
1160         unsigned long ino;
1161
1162         read_lock(&sk->sk_callback_lock);
1163         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1164         read_unlock(&sk->sk_callback_lock);
1165         return ino;
1166 }
1167
1168 /*
1169  * Allocate a skb from the socket's send buffer.
1170  */
1171 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1172                              gfp_t priority)
1173 {
1174         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1175                 struct sk_buff * skb = alloc_skb(size, priority);
1176                 if (skb) {
1177                         skb_set_owner_w(skb, sk);
1178                         return skb;
1179                 }
1180         }
1181         return NULL;
1182 }
1183
1184 /*
1185  * Allocate a skb from the socket's receive buffer.
1186  */
1187 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1188                              gfp_t priority)
1189 {
1190         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1191                 struct sk_buff *skb = alloc_skb(size, priority);
1192                 if (skb) {
1193                         skb_set_owner_r(skb, sk);
1194                         return skb;
1195                 }
1196         }
1197         return NULL;
1198 }
1199
1200 /*
1201  * Allocate a memory block from the socket's option memory buffer.
1202  */
1203 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1204 {
1205         if ((unsigned)size <= sysctl_optmem_max &&
1206             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1207                 void *mem;
1208                 /* First do the add, to avoid the race if kmalloc
1209                  * might sleep.
1210                  */
1211                 atomic_add(size, &sk->sk_omem_alloc);
1212                 mem = kmalloc(size, priority);
1213                 if (mem)
1214                         return mem;
1215                 atomic_sub(size, &sk->sk_omem_alloc);
1216         }
1217         return NULL;
1218 }
1219
1220 /*
1221  * Free an option memory block.
1222  */
1223 void sock_kfree_s(struct sock *sk, void *mem, int size)
1224 {
1225         kfree(mem);
1226         atomic_sub(size, &sk->sk_omem_alloc);
1227 }
1228
1229 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1230    I think, these locks should be removed for datagram sockets.
1231  */
1232 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1233 {
1234         DEFINE_WAIT(wait);
1235
1236         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1237         for (;;) {
1238                 if (!timeo)
1239                         break;
1240                 if (signal_pending(current))
1241                         break;
1242                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1243                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1244                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1245                         break;
1246                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1247                         break;
1248                 if (sk->sk_err)
1249                         break;
1250                 timeo = schedule_timeout(timeo);
1251         }
1252         finish_wait(sk->sk_sleep, &wait);
1253         return timeo;
1254 }
1255
1256
1257 /*
1258  *      Generic send/receive buffer handlers
1259  */
1260
1261 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1262                                             unsigned long header_len,
1263                                             unsigned long data_len,
1264                                             int noblock, int *errcode)
1265 {
1266         struct sk_buff *skb;
1267         gfp_t gfp_mask;
1268         long timeo;
1269         int err;
1270
1271         gfp_mask = sk->sk_allocation;
1272         if (gfp_mask & __GFP_WAIT)
1273                 gfp_mask |= __GFP_REPEAT;
1274
1275         timeo = sock_sndtimeo(sk, noblock);
1276         while (1) {
1277                 err = sock_error(sk);
1278                 if (err != 0)
1279                         goto failure;
1280
1281                 err = -EPIPE;
1282                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1283                         goto failure;
1284
1285                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1286                         skb = alloc_skb(header_len, gfp_mask);
1287                         if (skb) {
1288                                 int npages;
1289                                 int i;
1290
1291                                 /* No pages, we're done... */
1292                                 if (!data_len)
1293                                         break;
1294
1295                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1296                                 skb->truesize += data_len;
1297                                 skb_shinfo(skb)->nr_frags = npages;
1298                                 for (i = 0; i < npages; i++) {
1299                                         struct page *page;
1300                                         skb_frag_t *frag;
1301
1302                                         page = alloc_pages(sk->sk_allocation, 0);
1303                                         if (!page) {
1304                                                 err = -ENOBUFS;
1305                                                 skb_shinfo(skb)->nr_frags = i;
1306                                                 kfree_skb(skb);
1307                                                 goto failure;
1308                                         }
1309
1310                                         frag = &skb_shinfo(skb)->frags[i];
1311                                         frag->page = page;
1312                                         frag->page_offset = 0;
1313                                         frag->size = (data_len >= PAGE_SIZE ?
1314                                                       PAGE_SIZE :
1315                                                       data_len);
1316                                         data_len -= PAGE_SIZE;
1317                                 }
1318
1319                                 /* Full success... */
1320                                 break;
1321                         }
1322                         err = -ENOBUFS;
1323                         goto failure;
1324                 }
1325                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1326                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1327                 err = -EAGAIN;
1328                 if (!timeo)
1329                         goto failure;
1330                 if (signal_pending(current))
1331                         goto interrupted;
1332                 timeo = sock_wait_for_wmem(sk, timeo);
1333         }
1334
1335         skb_set_owner_w(skb, sk);
1336         return skb;
1337
1338 interrupted:
1339         err = sock_intr_errno(timeo);
1340 failure:
1341         *errcode = err;
1342         return NULL;
1343 }
1344
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments.  See sock_alloc_send_pskb() for the waiting and error
 * behaviour; on failure NULL is returned and *@errcode is set.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1350
1351 static void __lock_sock(struct sock *sk)
1352 {
1353         DEFINE_WAIT(wait);
1354
1355         for (;;) {
1356                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1357                                         TASK_UNINTERRUPTIBLE);
1358                 spin_unlock_bh(&sk->sk_lock.slock);
1359                 schedule();
1360                 spin_lock_bh(&sk->sk_lock.slock);
1361                 if (!sock_owned_by_user(sk))
1362                         break;
1363         }
1364         finish_wait(&sk->sk_lock.wq, &wait);
1365 }
1366
1367 static void __release_sock(struct sock *sk)
1368 {
1369         struct sk_buff *skb = sk->sk_backlog.head;
1370
1371         do {
1372                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1373                 bh_unlock_sock(sk);
1374
1375                 do {
1376                         struct sk_buff *next = skb->next;
1377
1378                         skb->next = NULL;
1379                         sk_backlog_rcv(sk, skb);
1380
1381                         /*
1382                          * We are in process context here with softirqs
1383                          * disabled, use cond_resched_softirq() to preempt.
1384                          * This is safe to do because we've taken the backlog
1385                          * queue private:
1386                          */
1387                         cond_resched_softirq();
1388
1389                         skb = next;
1390                 } while (skb != NULL);
1391
1392                 bh_lock_sock(sk);
1393         } while ((skb = sk->sk_backlog.head) != NULL);
1394 }
1395
1396 /**
1397  * sk_wait_data - wait for data to arrive at sk_receive_queue
1398  * @sk:    sock to wait on
1399  * @timeo: for how long
1400  *
1401  * Now socket state including sk->sk_err is changed only under lock,
1402  * hence we may omit checks after joining wait queue.
1403  * We check receive queue before schedule() only as optimization;
1404  * it is very likely that release_sock() added new data.
1405  */
1406 int sk_wait_data(struct sock *sk, long *timeo)
1407 {
1408         int rc;
1409         DEFINE_WAIT(wait);
1410
1411         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1412         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1413         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1414         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1415         finish_wait(sk->sk_sleep, &wait);
1416         return rc;
1417 }
1418
1419 EXPORT_SYMBOL(sk_wait_data);
1420
1421 /**
1422  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1423  *      @sk: socket
1424  *      @size: memory size to allocate
1425  *      @kind: allocation type
1426  *
1427  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1428  *      rmem allocation. This function assumes that protocols which have
1429  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1430  */
1431 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1432 {
1433         struct proto *prot = sk->sk_prot;
1434         int amt = sk_mem_pages(size);
1435         int allocated;
1436
1437         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1438         allocated = atomic_add_return(amt, prot->memory_allocated);
1439
1440         /* Under limit. */
1441         if (allocated <= prot->sysctl_mem[0]) {
1442                 if (prot->memory_pressure && *prot->memory_pressure)
1443                         *prot->memory_pressure = 0;
1444                 return 1;
1445         }
1446
1447         /* Under pressure. */
1448         if (allocated > prot->sysctl_mem[1])
1449                 if (prot->enter_memory_pressure)
1450                         prot->enter_memory_pressure(sk);
1451
1452         /* Over hard limit. */
1453         if (allocated > prot->sysctl_mem[2])
1454                 goto suppress_allocation;
1455
1456         /* guarantee minimum buffer size under pressure */
1457         if (kind == SK_MEM_RECV) {
1458                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1459                         return 1;
1460         } else { /* SK_MEM_SEND */
1461                 if (sk->sk_type == SOCK_STREAM) {
1462                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1463                                 return 1;
1464                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1465                            prot->sysctl_wmem[0])
1466                                 return 1;
1467         }
1468
1469         if (prot->memory_pressure) {
1470                 int alloc;
1471
1472                 if (!*prot->memory_pressure)
1473                         return 1;
1474                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1475                 if (prot->sysctl_mem[2] > alloc *
1476                     sk_mem_pages(sk->sk_wmem_queued +
1477                                  atomic_read(&sk->sk_rmem_alloc) +
1478                                  sk->sk_forward_alloc))
1479                         return 1;
1480         }
1481
1482 suppress_allocation:
1483
1484         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1485                 sk_stream_moderate_sndbuf(sk);
1486
1487                 /* Fail only if socket is _under_ its sndbuf.
1488                  * In this case we cannot block, so that we have to fail.
1489                  */
1490                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1491                         return 1;
1492         }
1493
1494         /* Alas. Undo changes. */
1495         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1496         atomic_sub(amt, prot->memory_allocated);
1497         return 0;
1498 }
1499
1500 EXPORT_SYMBOL(__sk_mem_schedule);
1501
1502 /**
1503  *      __sk_reclaim - reclaim memory_allocated
1504  *      @sk: socket
1505  */
1506 void __sk_mem_reclaim(struct sock *sk)
1507 {
1508         struct proto *prot = sk->sk_prot;
1509
1510         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1511                    prot->memory_allocated);
1512         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1513
1514         if (prot->memory_pressure && *prot->memory_pressure &&
1515             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1516                 *prot->memory_pressure = 0;
1517 }
1518
1519 EXPORT_SYMBOL(__sk_mem_reclaim);
1520
1521
1522 /*
1523  * Set of default routines for initialising struct proto_ops when
1524  * the protocol does not support a particular function. In certain
1525  * cases where it makes no sense for a protocol to have a "do nothing"
1526  * function, some default processing is provided.
1527  */
1528
1529 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1530 {
1531         return -EOPNOTSUPP;
1532 }
1533
1534 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1535                     int len, int flags)
1536 {
1537         return -EOPNOTSUPP;
1538 }
1539
1540 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1541 {
1542         return -EOPNOTSUPP;
1543 }
1544
1545 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1546 {
1547         return -EOPNOTSUPP;
1548 }
1549
1550 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1551                     int *len, int peer)
1552 {
1553         return -EOPNOTSUPP;
1554 }
1555
1556 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1557 {
1558         return 0;
1559 }
1560
1561 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1562 {
1563         return -EOPNOTSUPP;
1564 }
1565
1566 int sock_no_listen(struct socket *sock, int backlog)
1567 {
1568         return -EOPNOTSUPP;
1569 }
1570
1571 int sock_no_shutdown(struct socket *sock, int how)
1572 {
1573         return -EOPNOTSUPP;
1574 }
1575
1576 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1577                     char __user *optval, int optlen)
1578 {
1579         return -EOPNOTSUPP;
1580 }
1581
1582 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1583                     char __user *optval, int __user *optlen)
1584 {
1585         return -EOPNOTSUPP;
1586 }
1587
1588 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1589                     size_t len)
1590 {
1591         return -EOPNOTSUPP;
1592 }
1593
1594 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1595                     size_t len, int flags)
1596 {
1597         return -EOPNOTSUPP;
1598 }
1599
1600 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1601 {
1602         /* Mirror missing mmap method error code */
1603         return -ENODEV;
1604 }
1605
1606 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1607 {
1608         ssize_t res;
1609         struct msghdr msg = {.msg_flags = flags};
1610         struct kvec iov;
1611         char *kaddr = kmap(page);
1612         iov.iov_base = kaddr + offset;
1613         iov.iov_len = size;
1614         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1615         kunmap(page);
1616         return res;
1617 }
1618
1619 /*
1620  *      Default Socket Callbacks
1621  */
1622
1623 static void sock_def_wakeup(struct sock *sk)
1624 {
1625         read_lock(&sk->sk_callback_lock);
1626         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627                 wake_up_interruptible_all(sk->sk_sleep);
1628         read_unlock(&sk->sk_callback_lock);
1629 }
1630
1631 static void sock_def_error_report(struct sock *sk)
1632 {
1633         read_lock(&sk->sk_callback_lock);
1634         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1635                 wake_up_interruptible(sk->sk_sleep);
1636         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1637         read_unlock(&sk->sk_callback_lock);
1638 }
1639
1640 static void sock_def_readable(struct sock *sk, int len)
1641 {
1642         read_lock(&sk->sk_callback_lock);
1643         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1644                 wake_up_interruptible_sync(sk->sk_sleep);
1645         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1646         read_unlock(&sk->sk_callback_lock);
1647 }
1648
1649 static void sock_def_write_space(struct sock *sk)
1650 {
1651         read_lock(&sk->sk_callback_lock);
1652
1653         /* Do not wake up a writer until he can make "significant"
1654          * progress.  --DaveM
1655          */
1656         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1657                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1658                         wake_up_interruptible_sync(sk->sk_sleep);
1659
1660                 /* Should agree with poll, otherwise some programs break */
1661                 if (sock_writeable(sk))
1662                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1663         }
1664
1665         read_unlock(&sk->sk_callback_lock);
1666 }
1667
1668 static void sock_def_destruct(struct sock *sk)
1669 {
1670         kfree(sk->sk_protinfo);
1671 }
1672
1673 void sk_send_sigurg(struct sock *sk)
1674 {
1675         if (sk->sk_socket && sk->sk_socket->file)
1676                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1677                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1678 }
1679
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken via
 * sock_hold() whenever mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1688
/*
 * Cancel @timer.  If it was pending and del_timer() removed it, the
 * matching socket reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1696
1697 void sock_init_data(struct socket *sock, struct sock *sk)
1698 {
1699         skb_queue_head_init(&sk->sk_receive_queue);
1700         skb_queue_head_init(&sk->sk_write_queue);
1701         skb_queue_head_init(&sk->sk_error_queue);
1702 #ifdef CONFIG_NET_DMA
1703         skb_queue_head_init(&sk->sk_async_wait_queue);
1704 #endif
1705
1706         sk->sk_send_head        =       NULL;
1707
1708         init_timer(&sk->sk_timer);
1709
1710         sk->sk_allocation       =       GFP_KERNEL;
1711         sk->sk_rcvbuf           =       sysctl_rmem_default;
1712         sk->sk_sndbuf           =       sysctl_wmem_default;
1713         sk->sk_state            =       TCP_CLOSE;
1714         sk_set_socket(sk, sock);
1715
1716         sock_set_flag(sk, SOCK_ZAPPED);
1717
1718         if (sock) {
1719                 sk->sk_type     =       sock->type;
1720                 sk->sk_sleep    =       &sock->wait;
1721                 sock->sk        =       sk;
1722         } else
1723                 sk->sk_sleep    =       NULL;
1724
1725         rwlock_init(&sk->sk_dst_lock);
1726         rwlock_init(&sk->sk_callback_lock);
1727         lockdep_set_class_and_name(&sk->sk_callback_lock,
1728                         af_callback_keys + sk->sk_family,
1729                         af_family_clock_key_strings[sk->sk_family]);
1730
1731         sk->sk_state_change     =       sock_def_wakeup;
1732         sk->sk_data_ready       =       sock_def_readable;
1733         sk->sk_write_space      =       sock_def_write_space;
1734         sk->sk_error_report     =       sock_def_error_report;
1735         sk->sk_destruct         =       sock_def_destruct;
1736
1737         sk->sk_sndmsg_page      =       NULL;
1738         sk->sk_sndmsg_off       =       0;
1739
1740         sk->sk_peercred.pid     =       0;
1741         sk->sk_peercred.uid     =       -1;
1742         sk->sk_peercred.gid     =       -1;
1743         sk->sk_write_pending    =       0;
1744         sk->sk_rcvlowat         =       1;
1745         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1746         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1747
1748         sk->sk_stamp = ktime_set(-1L, 0);
1749
1750         atomic_set(&sk->sk_refcnt, 1);
1751         atomic_set(&sk->sk_drops, 0);
1752 }
1753
1754 void lock_sock_nested(struct sock *sk, int subclass)
1755 {
1756         might_sleep();
1757         spin_lock_bh(&sk->sk_lock.slock);
1758         if (sk->sk_lock.owned)
1759                 __lock_sock(sk);
1760         sk->sk_lock.owned = 1;
1761         spin_unlock(&sk->sk_lock.slock);
1762         /*
1763          * The sk_lock has mutex_lock() semantics here:
1764          */
1765         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1766         local_bh_enable();
1767 }
1768
1769 EXPORT_SYMBOL(lock_sock_nested);
1770
1771 void release_sock(struct sock *sk)
1772 {
1773         /*
1774          * The sk_lock has mutex_unlock() semantics:
1775          */
1776         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1777
1778         spin_lock_bh(&sk->sk_lock.slock);
1779         if (sk->sk_backlog.tail)
1780                 __release_sock(sk);
1781         sk->sk_lock.owned = 0;
1782         if (waitqueue_active(&sk->sk_lock.wq))
1783                 wake_up(&sk->sk_lock.wq);
1784         spin_unlock_bh(&sk->sk_lock.slock);
1785 }
1786 EXPORT_SYMBOL(release_sock);
1787
1788 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1789 {
1790         struct timeval tv;
1791         if (!sock_flag(sk, SOCK_TIMESTAMP))
1792                 sock_enable_timestamp(sk);
1793         tv = ktime_to_timeval(sk->sk_stamp);
1794         if (tv.tv_sec == -1)
1795                 return -ENOENT;
1796         if (tv.tv_sec == 0) {
1797                 sk->sk_stamp = ktime_get_real();
1798                 tv = ktime_to_timeval(sk->sk_stamp);
1799         }
1800         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1801 }
1802 EXPORT_SYMBOL(sock_get_timestamp);
1803
1804 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1805 {
1806         struct timespec ts;
1807         if (!sock_flag(sk, SOCK_TIMESTAMP))
1808                 sock_enable_timestamp(sk);
1809         ts = ktime_to_timespec(sk->sk_stamp);
1810         if (ts.tv_sec == -1)
1811                 return -ENOENT;
1812         if (ts.tv_sec == 0) {
1813                 sk->sk_stamp = ktime_get_real();
1814                 ts = ktime_to_timespec(sk->sk_stamp);
1815         }
1816         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1817 }
1818 EXPORT_SYMBOL(sock_get_timestampns);
1819
1820 void sock_enable_timestamp(struct sock *sk)
1821 {
1822         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1823                 sock_set_flag(sk, SOCK_TIMESTAMP);
1824                 net_enable_timestamp();
1825         }
1826 }
1827
1828 /*
1829  *      Get a socket option on an socket.
1830  *
1831  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1832  *      asynchronous errors should be reported by getsockopt. We assume
1833  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1834  */
1835 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1836                            char __user *optval, int __user *optlen)
1837 {
1838         struct sock *sk = sock->sk;
1839
1840         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1841 }
1842
1843 EXPORT_SYMBOL(sock_common_getsockopt);
1844
1845 #ifdef CONFIG_COMPAT
1846 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1847                                   char __user *optval, int __user *optlen)
1848 {
1849         struct sock *sk = sock->sk;
1850
1851         if (sk->sk_prot->compat_getsockopt != NULL)
1852                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1853                                                       optval, optlen);
1854         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1855 }
1856 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1857 #endif
1858
1859 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1860                         struct msghdr *msg, size_t size, int flags)
1861 {
1862         struct sock *sk = sock->sk;
1863         int addr_len = 0;
1864         int err;
1865
1866         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1867                                    flags & ~MSG_DONTWAIT, &addr_len);
1868         if (err >= 0)
1869                 msg->msg_namelen = addr_len;
1870         return err;
1871 }
1872
1873 EXPORT_SYMBOL(sock_common_recvmsg);
1874
1875 /*
1876  *      Set socket options on an inet socket.
1877  */
1878 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1879                            char __user *optval, int optlen)
1880 {
1881         struct sock *sk = sock->sk;
1882
1883         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1884 }
1885
1886 EXPORT_SYMBOL(sock_common_setsockopt);
1887
1888 #ifdef CONFIG_COMPAT
1889 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1890                                   char __user *optval, int optlen)
1891 {
1892         struct sock *sk = sock->sk;
1893
1894         if (sk->sk_prot->compat_setsockopt != NULL)
1895                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1896                                                       optval, optlen);
1897         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1898 }
1899 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1900 #endif
1901
1902 void sk_common_release(struct sock *sk)
1903 {
1904         if (sk->sk_prot->destroy)
1905                 sk->sk_prot->destroy(sk);
1906
1907         /*
1908          * Observation: when sock_common_release is called, processes have
1909          * no access to socket. But net still has.
1910          * Step one, detach it from networking:
1911          *
1912          * A. Remove from hash tables.
1913          */
1914
1915         sk->sk_prot->unhash(sk);
1916
1917         /*
1918          * In this point socket cannot receive new packets, but it is possible
1919          * that some packets are in flight because some CPU runs receiver and
1920          * did hash table lookup before we unhashed socket. They will achieve
1921          * receive queue and will be purged by socket destructor.
1922          *
1923          * Also we still have packets pending on receive queue and probably,
1924          * our own packets waiting in device queues. sock_destroy will drain
1925          * receive queue, but transmitted packets will delay socket destruction
1926          * until the last reference will be released.
1927          */
1928
1929         sock_orphan(sk);
1930
1931         xfrm_sk_free_policy(sk);
1932
1933         sk_refcnt_debug_release(sk);
1934         sock_put(sk);
1935 }
1936
1937 EXPORT_SYMBOL(sk_common_release);
1938
/*
 * List of protocols added by proto_register(), and the rwlock that is held
 * around every list update and around the /proc seq_file walk.
 */
static LIST_HEAD(proto_list);
static DEFINE_RWLOCK(proto_list_lock);
1941
1942 #ifdef CONFIG_PROC_FS
1943 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
1944 struct prot_inuse {
1945         int val[PROTO_INUSE_NR];
1946 };
1947
1948 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1949
1950 #ifdef CONFIG_NET_NS
1951 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1952 {
1953         int cpu = smp_processor_id();
1954         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1955 }
1956 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1957
1958 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1959 {
1960         int cpu, idx = prot->inuse_idx;
1961         int res = 0;
1962
1963         for_each_possible_cpu(cpu)
1964                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1965
1966         return res >= 0 ? res : 0;
1967 }
1968 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1969
1970 static int sock_inuse_init_net(struct net *net)
1971 {
1972         net->core.inuse = alloc_percpu(struct prot_inuse);
1973         return net->core.inuse ? 0 : -ENOMEM;
1974 }
1975
1976 static void sock_inuse_exit_net(struct net *net)
1977 {
1978         free_percpu(net->core.inuse);
1979 }
1980
1981 static struct pernet_operations net_inuse_ops = {
1982         .init = sock_inuse_init_net,
1983         .exit = sock_inuse_exit_net,
1984 };
1985
1986 static __init int net_inuse_init(void)
1987 {
1988         if (register_pernet_subsys(&net_inuse_ops))
1989                 panic("Cannot initialize net inuse counters");
1990
1991         return 0;
1992 }
1993
1994 core_initcall(net_inuse_init);
1995 #else
1996 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1997
1998 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1999 {
2000         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2001 }
2002 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2003
2004 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2005 {
2006         int cpu, idx = prot->inuse_idx;
2007         int res = 0;
2008
2009         for_each_possible_cpu(cpu)
2010                 res += per_cpu(prot_inuse, cpu).val[idx];
2011
2012         return res >= 0 ? res : 0;
2013 }
2014 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2015 #endif
2016
2017 static void assign_proto_idx(struct proto *prot)
2018 {
2019         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2020
2021         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2022                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2023                 return;
2024         }
2025
2026         set_bit(prot->inuse_idx, proto_inuse_idx);
2027 }
2028
2029 static void release_proto_idx(struct proto *prot)
2030 {
2031         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2032                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2033 }
2034 #else
/* No in-use counters without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot) { }
2038
/* No in-use counters without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot) { }
2042 #endif
2043
2044 int proto_register(struct proto *prot, int alloc_slab)
2045 {
2046         if (alloc_slab) {
2047                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2048                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2049                                         NULL);
2050
2051                 if (prot->slab == NULL) {
2052                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2053                                prot->name);
2054                         goto out;
2055                 }
2056
2057                 if (prot->rsk_prot != NULL) {
2058                         static const char mask[] = "request_sock_%s";
2059
2060                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2061                         if (prot->rsk_prot->slab_name == NULL)
2062                                 goto out_free_sock_slab;
2063
2064                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2065                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2066                                                                  prot->rsk_prot->obj_size, 0,
2067                                                                  SLAB_HWCACHE_ALIGN, NULL);
2068
2069                         if (prot->rsk_prot->slab == NULL) {
2070                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2071                                        prot->name);
2072                                 goto out_free_request_sock_slab_name;
2073                         }
2074                 }
2075
2076                 if (prot->twsk_prot != NULL) {
2077                         static const char mask[] = "tw_sock_%s";
2078
2079                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2080
2081                         if (prot->twsk_prot->twsk_slab_name == NULL)
2082                                 goto out_free_request_sock_slab;
2083
2084                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2085                         prot->twsk_prot->twsk_slab =
2086                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2087                                                   prot->twsk_prot->twsk_obj_size,
2088                                                   0,
2089                                                   SLAB_HWCACHE_ALIGN |
2090                                                         prot->slab_flags,
2091                                                   NULL);
2092                         if (prot->twsk_prot->twsk_slab == NULL)
2093                                 goto out_free_timewait_sock_slab_name;
2094                 }
2095         }
2096
2097         write_lock(&proto_list_lock);
2098         list_add(&prot->node, &proto_list);
2099         assign_proto_idx(prot);
2100         write_unlock(&proto_list_lock);
2101         return 0;
2102
2103 out_free_timewait_sock_slab_name:
2104         kfree(prot->twsk_prot->twsk_slab_name);
2105 out_free_request_sock_slab:
2106         if (prot->rsk_prot && prot->rsk_prot->slab) {
2107                 kmem_cache_destroy(prot->rsk_prot->slab);
2108                 prot->rsk_prot->slab = NULL;
2109         }
2110 out_free_request_sock_slab_name:
2111         kfree(prot->rsk_prot->slab_name);
2112 out_free_sock_slab:
2113         kmem_cache_destroy(prot->slab);
2114         prot->slab = NULL;
2115 out:
2116         return -ENOBUFS;
2117 }
2118
2119 EXPORT_SYMBOL(proto_register);
2120
2121 void proto_unregister(struct proto *prot)
2122 {
2123         write_lock(&proto_list_lock);
2124         release_proto_idx(prot);
2125         list_del(&prot->node);
2126         write_unlock(&proto_list_lock);
2127
2128         if (prot->slab != NULL) {
2129                 kmem_cache_destroy(prot->slab);
2130                 prot->slab = NULL;
2131         }
2132
2133         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2134                 kmem_cache_destroy(prot->rsk_prot->slab);
2135                 kfree(prot->rsk_prot->slab_name);
2136                 prot->rsk_prot->slab = NULL;
2137         }
2138
2139         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2140                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2141                 kfree(prot->twsk_prot->twsk_slab_name);
2142                 prot->twsk_prot->twsk_slab = NULL;
2143         }
2144 }
2145
2146 EXPORT_SYMBOL(proto_unregister);
2147
2148 #ifdef CONFIG_PROC_FS
2149 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2150         __acquires(proto_list_lock)
2151 {
2152         read_lock(&proto_list_lock);
2153         return seq_list_start_head(&proto_list, *pos);
2154 }
2155
2156 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2157 {
2158         return seq_list_next(v, &proto_list, pos);
2159 }
2160
2161 static void proto_seq_stop(struct seq_file *seq, void *v)
2162         __releases(proto_list_lock)
2163 {
2164         read_unlock(&proto_list_lock);
2165 }
2166
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2171
2172 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2173 {
2174         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2175                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2176                    proto->name,
2177                    proto->obj_size,
2178                    sock_prot_inuse_get(seq_file_net(seq), proto),
2179                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2180                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2181                    proto->max_header,
2182                    proto->slab == NULL ? "no" : "yes",
2183                    module_name(proto->owner),
2184                    proto_method_implemented(proto->close),
2185                    proto_method_implemented(proto->connect),
2186                    proto_method_implemented(proto->disconnect),
2187                    proto_method_implemented(proto->accept),
2188                    proto_method_implemented(proto->ioctl),
2189                    proto_method_implemented(proto->init),
2190                    proto_method_implemented(proto->destroy),
2191                    proto_method_implemented(proto->shutdown),
2192                    proto_method_implemented(proto->setsockopt),
2193                    proto_method_implemented(proto->getsockopt),
2194                    proto_method_implemented(proto->sendmsg),
2195                    proto_method_implemented(proto->recvmsg),
2196                    proto_method_implemented(proto->sendpage),
2197                    proto_method_implemented(proto->bind),
2198                    proto_method_implemented(proto->backlog_rcv),
2199                    proto_method_implemented(proto->hash),
2200                    proto_method_implemented(proto->unhash),
2201                    proto_method_implemented(proto->get_port),
2202                    proto_method_implemented(proto->enter_memory_pressure));
2203 }
2204
2205 static int proto_seq_show(struct seq_file *seq, void *v)
2206 {
2207         if (v == &proto_list)
2208                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2209                            "protocol",
2210                            "size",
2211                            "sockets",
2212                            "memory",
2213                            "press",
2214                            "maxhdr",
2215                            "slab",
2216                            "module",
2217                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2218         else
2219                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2220         return 0;
2221 }
2222
2223 static const struct seq_operations proto_seq_ops = {
2224         .start  = proto_seq_start,
2225         .next   = proto_seq_next,
2226         .stop   = proto_seq_stop,
2227         .show   = proto_seq_show,
2228 };
2229
2230 static int proto_seq_open(struct inode *inode, struct file *file)
2231 {
2232         return seq_open_net(inode, file, &proto_seq_ops,
2233                             sizeof(struct seq_net_private));
2234 }
2235
2236 static const struct file_operations proto_seq_fops = {
2237         .owner          = THIS_MODULE,
2238         .open           = proto_seq_open,
2239         .read           = seq_read,
2240         .llseek         = seq_lseek,
2241         .release        = seq_release_net,
2242 };
2243
2244 static __net_init int proto_init_net(struct net *net)
2245 {
2246         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2247                 return -ENOMEM;
2248
2249         return 0;
2250 }
2251
2252 static __net_exit void proto_exit_net(struct net *net)
2253 {
2254         proc_net_remove(net, "protocols");
2255 }
2256
2257
2258 static __net_initdata struct pernet_operations proto_net_ops = {
2259         .init = proto_init_net,
2260         .exit = proto_exit_net,
2261 };
2262
2263 static int __init proto_init(void)
2264 {
2265         return register_pernet_subsys(&proto_net_ops);
2266 }
2267
2268 subsys_initcall(proto_init);
2269
2270 #endif /* PROC_FS */
2271
/* Symbols exported to modules that carry no export next to their definition. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sysctl_optmem_max);