/*
 *  net/dccp/proto.c
 *
 *  An implementation of the DCCP protocol
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 *      This program is free software; you can redistribute it and/or modify it
 *      under the terms of the GNU General Public License version 2 as
 *      published by the Free Software Foundation.
 */

#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/random.h>
#include <net/checksum.h>

#include <net/inet_common.h>
#include <net/inet_sock.h>
#include <net/protocol.h>
#include <net/sock.h>
#include <net/xfrm.h>

#include <asm/semaphore.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/poll.h>

#include "ccid.h"
#include "dccp.h"

DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

atomic_t dccp_orphan_count = ATOMIC_INIT(0);

EXPORT_SYMBOL_GPL(dccp_orphan_count);

static struct net_protocol dccp_protocol = {
        .handler        = dccp_v4_rcv,
        .err_handler    = dccp_v4_err,
        .no_policy      = 1,
};

const char *dccp_packet_name(const int type)
{
        static const char *dccp_packet_names[] = {
                [DCCP_PKT_REQUEST]  = "REQUEST",
                [DCCP_PKT_RESPONSE] = "RESPONSE",
                [DCCP_PKT_DATA]     = "DATA",
                [DCCP_PKT_ACK]      = "ACK",
                [DCCP_PKT_DATAACK]  = "DATAACK",
                [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
                [DCCP_PKT_CLOSE]    = "CLOSE",
                [DCCP_PKT_RESET]    = "RESET",
                [DCCP_PKT_SYNC]     = "SYNC",
                [DCCP_PKT_SYNCACK]  = "SYNCACK",
        };

        if (type >= DCCP_NR_PKT_TYPES)
                return "INVALID";
        else
                return dccp_packet_names[type];
}

EXPORT_SYMBOL_GPL(dccp_packet_name);

const char *dccp_state_name(const int state)
{
        static char *dccp_state_names[] = {
        [DCCP_OPEN]       = "OPEN",
        [DCCP_REQUESTING] = "REQUESTING",
        [DCCP_PARTOPEN]   = "PARTOPEN",
        [DCCP_LISTEN]     = "LISTEN",
        [DCCP_RESPOND]    = "RESPOND",
        [DCCP_CLOSING]    = "CLOSING",
        [DCCP_TIME_WAIT]  = "TIME_WAIT",
        [DCCP_CLOSED]     = "CLOSED",
        };

        if (state >= DCCP_MAX_STATES)
                return "INVALID STATE!";
        else
                return dccp_state_names[state];
}

EXPORT_SYMBOL_GPL(dccp_state_name);

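/* Put the socket into the listening state; called from inet_dccp_listen(). */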
static inline int dccp_listen_start(struct sock *sk)
{
        struct dccp_sock *dp = dccp_sk(sk);

        dp->dccps_role = DCCP_ROLE_LISTEN;
        /*
         * Apps need to use setsockopt(DCCP_SOCKOPT_SERVICE)
         * before calling listen()
         */
        if (dccp_service_not_initialized(sk))
                return -EPROTO;
        return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
}

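/*
 * Abort any connection in progress and return the socket to a clean,
 * bound state: the receive queue is purged, pending transmit timers are
 * cleared and the cached route is dropped.
 */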
int dccp_disconnect(struct sock *sk, int flags)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet = inet_sk(sk);
        int err = 0;
        const int old_state = sk->sk_state;

        if (old_state != DCCP_CLOSED)
                dccp_set_state(sk, DCCP_CLOSED);

        /* ABORT function of RFC793 */
        if (old_state == DCCP_LISTEN) {
                inet_csk_listen_stop(sk);
        /* FIXME: do the active reset thing */
        } else if (old_state == DCCP_REQUESTING)
                sk->sk_err = ECONNRESET;

        dccp_clear_xmit_timers(sk);
        __skb_queue_purge(&sk->sk_receive_queue);
        if (sk->sk_send_head != NULL) {
                __kfree_skb(sk->sk_send_head);
                sk->sk_send_head = NULL;
        }

        inet->dport = 0;

        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                inet_reset_saddr(sk);

        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);

        icsk->icsk_backoff = 0;
        inet_csk_delack_init(sk);
        __sk_dst_reset(sk);

        BUG_TRAP(!inet->num || icsk->icsk_bind_hash);

        sk->sk_error_report(sk);
        return err;
}

EXPORT_SYMBOL_GPL(dccp_disconnect);

/*
 *      Wait for a DCCP event.
 *
 *      Note that we don't need to lock the socket, as the upper poll layers
 *      take care of normal races (between the test and the event) and we don't
 *      go look at any of the socket buffers directly.
 */
unsigned int dccp_poll(struct file *file, struct socket *sock,
                       poll_table *wait)
{
        unsigned int mask;
        struct sock *sk = sock->sk;

        poll_wait(file, sk->sk_sleep, wait);
        if (sk->sk_state == DCCP_LISTEN)
                return inet_csk_listen_poll(sk);

        /* Socket is not locked. We are protected from async events
           by poll logic, and correct handling of state changes
           made by other threads is impossible in any case.
         */

        mask = 0;
        if (sk->sk_err)
                mask = POLLERR;

        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
                mask |= POLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLIN | POLLRDNORM;

        /* Connected? */
        if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
                if (atomic_read(&sk->sk_rmem_alloc) > 0)
                        mask |= POLLIN | POLLRDNORM;

                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
                        if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                                mask |= POLLOUT | POLLWRNORM;
                        } else {  /* send SIGIO later */
                                set_bit(SOCK_ASYNC_NOSPACE,
                                        &sk->sk_socket->flags);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                /* Race breaker. If space is freed after
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost.
                                 */
                                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                                        mask |= POLLOUT | POLLWRNORM;
                        }
                }
        }
        return mask;
}

EXPORT_SYMBOL_GPL(dccp_poll);

int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
        dccp_pr_debug("entry\n");
        return -ENOIOCTLCMD;
}

EXPORT_SYMBOL_GPL(dccp_ioctl);

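/*
 * Set the service code for this socket.  If more than one 32-bit value
 * is supplied, the extra values are copied into a newly allocated
 * dccp_service_list that replaces any previously installed list.
 */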
static int dccp_setsockopt_service(struct sock *sk, const u32 service,
                                   char __user *optval, int optlen)
{
        struct dccp_sock *dp = dccp_sk(sk);
        struct dccp_service_list *sl = NULL;

        if (service == DCCP_SERVICE_INVALID_VALUE ||
            optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
                return -EINVAL;

        if (optlen > sizeof(service)) {
                sl = kmalloc(optlen, GFP_KERNEL);
                if (sl == NULL)
                        return -ENOMEM;

                sl->dccpsl_nr = optlen / sizeof(u32) - 1;
                if (copy_from_user(sl->dccpsl_list,
                                   optval + sizeof(service),
                                   optlen - sizeof(service)) ||
                    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
                        kfree(sl);
                        return -EFAULT;
                }
        }

        lock_sock(sk);
        dp->dccps_service = service;

        kfree(dp->dccps_service_list);

        dp->dccps_service_list = sl;
        release_sock(sk);
        return 0;
}

int dccp_setsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int optlen)
{
        struct dccp_sock *dp;
        int err;
        int val;

        if (level != SOL_DCCP)
                return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
                                                             optname, optval,
                                                             optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (get_user(val, (int __user *)optval))
                return -EFAULT;

        if (optname == DCCP_SOCKOPT_SERVICE)
                return dccp_setsockopt_service(sk, val, optval, optlen);

        lock_sock(sk);
        dp = dccp_sk(sk);
        err = 0;

        switch (optname) {
        case DCCP_SOCKOPT_PACKET_SIZE:
                dp->dccps_packet_size = val;
                break;
        default:
                err = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);
        return err;
}

EXPORT_SYMBOL_GPL(dccp_setsockopt);

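/*
 * Copy the service code, followed by the optional service list, to user
 * space.  Returns -ENOENT if no service was ever set and -EINVAL if the
 * caller's buffer is too small for the full answer.
 */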
static int dccp_getsockopt_service(struct sock *sk, int len,
                                   u32 __user *optval,
                                   int __user *optlen)
{
        const struct dccp_sock *dp = dccp_sk(sk);
        const struct dccp_service_list *sl;
        int err = -ENOENT, slen = 0, total_len = sizeof(u32);

        lock_sock(sk);
        if (dccp_service_not_initialized(sk))
                goto out;

        if ((sl = dp->dccps_service_list) != NULL) {
                slen = sl->dccpsl_nr * sizeof(u32);
                total_len += slen;
        }

        err = -EINVAL;
        if (total_len > len)
                goto out;

        err = 0;
        if (put_user(total_len, optlen) ||
            put_user(dp->dccps_service, optval) ||
            (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
                err = -EFAULT;
out:
        release_sock(sk);
        return err;
}

int dccp_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct dccp_sock *dp;
        int val, len;

        if (level != SOL_DCCP)
                return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
                                                             optname, optval,
                                                             optlen);
        if (get_user(len, optlen))
                return -EFAULT;

        if (len < sizeof(int))
                return -EINVAL;

        dp = dccp_sk(sk);

        switch (optname) {
        case DCCP_SOCKOPT_PACKET_SIZE:
                val = dp->dccps_packet_size;
                len = sizeof(dp->dccps_packet_size);
                break;
        case DCCP_SOCKOPT_SERVICE:
                return dccp_getsockopt_service(sk, len,
                                               (u32 __user *)optval, optlen);
        case 128 ... 191:
                return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
                                             len, (u32 __user *)optval, optlen);
        case 192 ... 255:
                return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
                                             len, (u32 __user *)optval, optlen);
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

EXPORT_SYMBOL_GPL(dccp_getsockopt);

int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                 size_t len)
{
        const struct dccp_sock *dp = dccp_sk(sk);
        const int flags = msg->msg_flags;
        const int noblock = flags & MSG_DONTWAIT;
        struct sk_buff *skb;
        int rc, size;
        long timeo;

        if (len > dp->dccps_mss_cache)
                return -EMSGSIZE;

        lock_sock(sk);
        timeo = sock_sndtimeo(sk, noblock);

        /*
         * We have to use sk_stream_wait_connect here to set sk_write_pending,
         * so that the trick in dccp_rcv_request_sent_state_process works.
         */
        /* Wait for a connection to finish. */
        if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
                if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
                        goto out_release;

        size = sk->sk_prot->max_header + len;
        release_sock(sk);
        skb = sock_alloc_send_skb(sk, size, noblock, &rc);
        lock_sock(sk);
        if (skb == NULL)
                goto out_release;

        skb_reserve(skb, sk->sk_prot->max_header);
        rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (rc != 0)
                goto out_discard;

        rc = dccp_write_xmit(sk, skb, &timeo);
        /*
         * XXX we don't use sk_write_queue, so just discard the packet.
         *     The current plan, however, is to _use_ sk_write_queue with
         *     an algorithm similar to tcp_sendmsg, where the main difference
         *     is that in DCCP we have to respect packet boundaries, so
         *     no coalescing of skbs.
         *
         *     This bug was _quickly_ found & fixed by just looking at an OSTRA
         *     generated callgraph 8) -acme
         */
out_release:
        release_sock(sk);
        return rc ? : len;
out_discard:
        kfree_skb(skb);
        goto out_release;
}

EXPORT_SYMBOL_GPL(dccp_sendmsg);

int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                 size_t len, int nonblock, int flags, int *addr_len)
{
        const struct dccp_hdr *dh;
        long timeo;

        lock_sock(sk);

        if (sk->sk_state == DCCP_LISTEN) {
                len = -ENOTCONN;
                goto out;
        }

        timeo = sock_rcvtimeo(sk, nonblock);

        do {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                if (skb == NULL)
                        goto verify_sock_status;

                dh = dccp_hdr(skb);

                if (dh->dccph_type == DCCP_PKT_DATA ||
                    dh->dccph_type == DCCP_PKT_DATAACK)
                        goto found_ok_skb;

                if (dh->dccph_type == DCCP_PKT_RESET ||
                    dh->dccph_type == DCCP_PKT_CLOSE) {
                        dccp_pr_debug("found fin ok!\n");
                        len = 0;
                        goto found_fin_ok;
                }
                dccp_pr_debug("packet_type=%s\n",
                              dccp_packet_name(dh->dccph_type));
                sk_eat_skb(sk, skb);
verify_sock_status:
                if (sock_flag(sk, SOCK_DONE)) {
                        len = 0;
                        break;
                }

                if (sk->sk_err) {
                        len = sock_error(sk);
                        break;
                }

                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        len = 0;
                        break;
                }

                if (sk->sk_state == DCCP_CLOSED) {
                        if (!sock_flag(sk, SOCK_DONE)) {
                                /* This occurs when the user tries to read
                                 * from a socket that was never connected.
                                 */
                                len = -ENOTCONN;
                                break;
                        }
                        len = 0;
                        break;
                }

                if (!timeo) {
                        len = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        len = sock_intr_errno(timeo);
                        break;
                }

                sk_wait_data(sk, &timeo);
                continue;
        found_ok_skb:
                if (len > skb->len)
                        len = skb->len;
                else if (len < skb->len)
                        msg->msg_flags |= MSG_TRUNC;

                if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
                        /* Exception. Bailout! */
                        len = -EFAULT;
                        break;
                }
        found_fin_ok:
                if (!(flags & MSG_PEEK))
                        sk_eat_skb(sk, skb);
                break;
        } while (1);
out:
        release_sock(sk);
        return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);

int inet_dccp_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char old_state;
        int err;

        lock_sock(sk);

        err = -EINVAL;
        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
                goto out;

        old_state = sk->sk_state;
        if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
                goto out;

        /* Really, if the socket is already in listen state
         * we can only allow the backlog to be adjusted.
         */
        if (old_state != DCCP_LISTEN) {
                /*
                 * FIXME: here it probably should be sk->sk_prot->listen_start
                 * see tcp_listen_start
                 */
                err = dccp_listen_start(sk);
                if (err)
                        goto out;
        }
        sk->sk_max_ack_backlog = backlog;
        err = 0;

out:
        release_sock(sk);
        return err;
}

EXPORT_SYMBOL_GPL(inet_dccp_listen);

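/*
 * Maps the current socket state to the state entered on close().  Entries
 * with DCCP_ACTION_FIN set additionally require an active close, i.e.
 * sending a CLOSE/CLOSEREQ packet (see dccp_close_state() below).
 */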
static const unsigned char dccp_new_state[] = {
        /* current state:   new state:      action:     */
        [0]               = DCCP_CLOSED,
        [DCCP_OPEN]       = DCCP_CLOSING | DCCP_ACTION_FIN,
        [DCCP_REQUESTING] = DCCP_CLOSED,
        [DCCP_PARTOPEN]   = DCCP_CLOSING | DCCP_ACTION_FIN,
        [DCCP_LISTEN]     = DCCP_CLOSED,
        [DCCP_RESPOND]    = DCCP_CLOSED,
        [DCCP_CLOSING]    = DCCP_CLOSED,
        [DCCP_TIME_WAIT]  = DCCP_CLOSED,
        [DCCP_CLOSED]     = DCCP_CLOSED,
};

static int dccp_close_state(struct sock *sk)
{
        const int next = dccp_new_state[sk->sk_state];
        const int ns = next & DCCP_STATE_MASK;

        if (ns != sk->sk_state)
                dccp_set_state(sk, ns);

        return next & DCCP_ACTION_FIN;
}

void dccp_close(struct sock *sk, long timeout)
{
        struct sk_buff *skb;

        lock_sock(sk);

        sk->sk_shutdown = SHUTDOWN_MASK;

        if (sk->sk_state == DCCP_LISTEN) {
                dccp_set_state(sk, DCCP_CLOSED);

                /* Special case. */
                inet_csk_listen_stop(sk);

                goto adjudge_to_death;
        }

        /*
         * We need to flush the recv. buffs.  We do this only on the
         * descriptor close, not protocol-sourced closes, because the
         * reader process may not have drained the data yet!
         */
        /* FIXME: check for unread data */
        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                __kfree_skb(skb);
        }

        if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
        } else if (dccp_close_state(sk)) {
                dccp_send_close(sk, 1);
        }

        sk_stream_wait_close(sk, timeout);

adjudge_to_death:
        /*
         * It is the last release_sock in its life. It will remove backlog.
         */
        release_sock(sk);
        /*
         * Now socket is owned by kernel and we acquire BH lock
         * to finish close. No need to check for user refs.
         */
        local_bh_disable();
        bh_lock_sock(sk);
        BUG_TRAP(!sock_owned_by_user(sk));

        sock_hold(sk);
        sock_orphan(sk);

        /*
         * The last release_sock may have processed the CLOSE or RESET
         * packet moving sock to CLOSED state, if not we have to fire
         * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
         * in draft-ietf-dccp-spec-11. -acme
         */
        if (sk->sk_state == DCCP_CLOSING) {
                /* FIXME: should start at 2 * RTT */
                /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                          inet_csk(sk)->icsk_rto,
                                          DCCP_RTO_MAX);
#if 0
                /* Yeah, we should use sk->sk_prot->orphan_count, etc */
                dccp_set_state(sk, DCCP_CLOSED);
#endif
        }

        atomic_inc(sk->sk_prot->orphan_count);
        if (sk->sk_state == DCCP_CLOSED)
                inet_csk_destroy_sock(sk);

        /* Otherwise, socket is reprieved until protocol close. */

        bh_unlock_sock(sk);
        local_bh_enable();
        sock_put(sk);
}

EXPORT_SYMBOL_GPL(dccp_close);

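/* shutdown() is not implemented for DCCP yet; only log the call for now. */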
void dccp_shutdown(struct sock *sk, int how)
{
        dccp_pr_debug("entry\n");
}

EXPORT_SYMBOL_GPL(dccp_shutdown);

static const struct proto_ops inet_dccp_ops = {
        .family         = PF_INET,
        .owner          = THIS_MODULE,
        .release        = inet_release,
        .bind           = inet_bind,
        .connect        = inet_stream_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = inet_accept,
        .getname        = inet_getname,
        /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
        .poll           = dccp_poll,
        .ioctl          = inet_ioctl,
        /* FIXME: work on inet_listen to rename it to sock_common_listen */
        .listen         = inet_dccp_listen,
        .shutdown       = inet_shutdown,
        .setsockopt     = sock_common_setsockopt,
        .getsockopt     = sock_common_getsockopt,
        .sendmsg        = inet_sendmsg,
        .recvmsg        = sock_common_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};

extern struct net_proto_family inet_family_ops;

static struct inet_protosw dccp_v4_protosw = {
        .type           = SOCK_DCCP,
        .protocol       = IPPROTO_DCCP,
        .prot           = &dccp_prot,
        .ops            = &inet_dccp_ops,
        .capability     = -1,
        .no_check       = 0,
        .flags          = INET_PROTOSW_ICSK,
};

/*
 * This is the global socket used for responding to out-of-the-blue (OOTB)
 * packets.  A control sock is created for it at initialization time.
 */
struct socket *dccp_ctl_socket;

static char dccp_ctl_socket_err_msg[] __initdata =
        KERN_ERR "DCCP: Failed to create the control socket.\n";

static int __init dccp_ctl_sock_init(void)
{
        int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
                                  &dccp_ctl_socket);
        if (rc < 0)
                printk(dccp_ctl_socket_err_msg);
        else {
                dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
                inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;

                /* Unhash it so that IP input processing does not even
                 * see it, we do not wish this socket to see incoming
                 * packets.
                 */
                dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
        }

        return rc;
}

#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
void dccp_ctl_sock_exit(void)
{
        if (dccp_ctl_socket != NULL) {
                sock_release(dccp_ctl_socket);
                dccp_ctl_socket = NULL;
        }
}

EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
#endif

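/*
 * Allocate the two per-CPU instances of the DCCP SNMP counters; if the
 * second allocation fails, the first one is released again.
 */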
static int __init init_dccp_v4_mibs(void)
{
        int rc = -ENOMEM;

        dccp_statistics[0] = alloc_percpu(struct dccp_mib);
        if (dccp_statistics[0] == NULL)
                goto out;

        dccp_statistics[1] = alloc_percpu(struct dccp_mib);
        if (dccp_statistics[1] == NULL)
                goto out_free_one;

        rc = 0;
out:
        return rc;
out_free_one:
        free_percpu(dccp_statistics[0]);
        dccp_statistics[0] = NULL;
        goto out;

}

static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
int dccp_debug;
module_param(dccp_debug, int, 0444);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif

static int __init dccp_init(void)
{
        unsigned long goal;
        int ehash_order, bhash_order, i;
        int rc = proto_register(&dccp_prot, 1);

        if (rc)
                goto out;

        dccp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("dccp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!dccp_hashinfo.bind_bucket_cachep)
                goto out_proto_unregister;

        /*
         * Size and allocate the main established and bind bucket
         * hash tables.
         *
         * The methodology is similar to that of the buffer cache.
         */
        if (num_physpages >= (128 * 1024))
                goal = num_physpages >> (21 - PAGE_SHIFT);
        else
                goal = num_physpages >> (23 - PAGE_SHIFT);

        if (thash_entries)
                goal = (thash_entries *
                        sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
        for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
                ;
        do {
                dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
                                        sizeof(struct inet_ehash_bucket);
                dccp_hashinfo.ehash_size >>= 1;
                while (dccp_hashinfo.ehash_size &
                       (dccp_hashinfo.ehash_size - 1))
                        dccp_hashinfo.ehash_size--;
                dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
                        __get_free_pages(GFP_ATOMIC, ehash_order);
        } while (!dccp_hashinfo.ehash && --ehash_order > 0);

        if (!dccp_hashinfo.ehash) {
                printk(KERN_CRIT "Failed to allocate DCCP "
                                 "established hash table\n");
                goto out_free_bind_bucket_cachep;
        }

        for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
                rwlock_init(&dccp_hashinfo.ehash[i].lock);
                INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
        }

        bhash_order = ehash_order;

        do {
                dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
                                        sizeof(struct inet_bind_hashbucket);
                if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
                    bhash_order > 0)
                        continue;
                dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
                        __get_free_pages(GFP_ATOMIC, bhash_order);
        } while (!dccp_hashinfo.bhash && --bhash_order >= 0);

        if (!dccp_hashinfo.bhash) {
                printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
                goto out_free_dccp_ehash;
        }

        for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
                spin_lock_init(&dccp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
        }

        if (init_dccp_v4_mibs())
                goto out_free_dccp_bhash;

        rc = -EAGAIN;
        if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
                goto out_free_dccp_v4_mibs;

        inet_register_protosw(&dccp_v4_protosw);

        rc = dccp_ctl_sock_init();
        if (rc)
                goto out_unregister_protosw;
out:
        return rc;
out_unregister_protosw:
        inet_unregister_protosw(&dccp_v4_protosw);
        inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
out_free_dccp_v4_mibs:
        free_percpu(dccp_statistics[0]);
        free_percpu(dccp_statistics[1]);
        dccp_statistics[0] = dccp_statistics[1] = NULL;
out_free_dccp_bhash:
        free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
        dccp_hashinfo.bhash = NULL;
out_free_dccp_ehash:
        free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
        dccp_hashinfo.ehash = NULL;
out_free_bind_bucket_cachep:
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
        dccp_hashinfo.bind_bucket_cachep = NULL;
out_proto_unregister:
        proto_unregister(&dccp_prot);
        goto out;
}

static const char dccp_del_proto_err_msg[] __exitdata =
        KERN_ERR "can't remove dccp net_protocol\n";

static void __exit dccp_fini(void)
{
        inet_unregister_protosw(&dccp_v4_protosw);

        if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
                printk(dccp_del_proto_err_msg);

        free_percpu(dccp_statistics[0]);
        free_percpu(dccp_statistics[1]);
        free_pages((unsigned long)dccp_hashinfo.bhash,
                   get_order(dccp_hashinfo.bhash_size *
                             sizeof(struct inet_bind_hashbucket)));
        free_pages((unsigned long)dccp_hashinfo.ehash,
                   get_order(dccp_hashinfo.ehash_size *
                             sizeof(struct inet_ehash_bucket)));
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
        proto_unregister(&dccp_prot);
}

module_init(dccp_init);
module_exit(dccp_fini);

/*
 * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP
 * (33) values directly.  Also cover the case where the protocol is not
 * specified, i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
 */
MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");