[DCCP]: Move dccp_hashinfo from ipv4.c to the core
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *      This program is free software; you can redistribute it and/or modify it
8  *      under the terms of the GNU General Public License version 2 as
9  *      published by the Free Software Foundation.
10  */
11
12 #include <linux/config.h>
13 #include <linux/dccp.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/if_arp.h>
22 #include <linux/init.h>
23 #include <linux/random.h>
24 #include <net/checksum.h>
25
26 #include <net/inet_common.h>
27 #include <net/inet_sock.h>
28 #include <net/protocol.h>
29 #include <net/sock.h>
30 #include <net/xfrm.h>
31
32 #include <asm/semaphore.h>
33 #include <linux/spinlock.h>
34 #include <linux/timer.h>
35 #include <linux/delay.h>
36 #include <linux/poll.h>
37
38 #include "ccid.h"
39 #include "dccp.h"
40 #include "feat.h"
41
42 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
43
44 EXPORT_SYMBOL_GPL(dccp_statistics);
45
46 atomic_t dccp_orphan_count = ATOMIC_INIT(0);
47
48 EXPORT_SYMBOL_GPL(dccp_orphan_count);
49
50 struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
51         .lhash_lock     = RW_LOCK_UNLOCKED,
52         .lhash_users    = ATOMIC_INIT(0),
53         .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
54 };
55
56 EXPORT_SYMBOL_GPL(dccp_hashinfo);
57
58 static struct net_protocol dccp_protocol = {
59         .handler        = dccp_v4_rcv,
60         .err_handler    = dccp_v4_err,
61         .no_policy      = 1,
62 };
63
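/*
 * dccp_packet_name - return a printable name for a DCCP packet type, for
 * use in debug messages; out-of-range types map to "INVALID".
 */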
64 const char *dccp_packet_name(const int type)
65 {
66         static const char *dccp_packet_names[] = {
67                 [DCCP_PKT_REQUEST]  = "REQUEST",
68                 [DCCP_PKT_RESPONSE] = "RESPONSE",
69                 [DCCP_PKT_DATA]     = "DATA",
70                 [DCCP_PKT_ACK]      = "ACK",
71                 [DCCP_PKT_DATAACK]  = "DATAACK",
72                 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
73                 [DCCP_PKT_CLOSE]    = "CLOSE",
74                 [DCCP_PKT_RESET]    = "RESET",
75                 [DCCP_PKT_SYNC]     = "SYNC",
76                 [DCCP_PKT_SYNCACK]  = "SYNCACK",
77         };
78
79         if (type >= DCCP_NR_PKT_TYPES)
80                 return "INVALID";
81         else
82                 return dccp_packet_names[type];
83 }
84
85 EXPORT_SYMBOL_GPL(dccp_packet_name);
86
87 const char *dccp_state_name(const int state)
88 {
89         static char *dccp_state_names[] = {
90         [DCCP_OPEN]       = "OPEN",
91         [DCCP_REQUESTING] = "REQUESTING",
92         [DCCP_PARTOPEN]   = "PARTOPEN",
93         [DCCP_LISTEN]     = "LISTEN",
94         [DCCP_RESPOND]    = "RESPOND",
95         [DCCP_CLOSING]    = "CLOSING",
96         [DCCP_TIME_WAIT]  = "TIME_WAIT",
97         [DCCP_CLOSED]     = "CLOSED",
98         };
99
100         if (state >= DCCP_MAX_STATES)
101                 return "INVALID STATE!";
102         else
103                 return dccp_state_names[state];
104 }
105
106 EXPORT_SYMBOL_GPL(dccp_state_name);
107
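/*
 * dccp_hash/dccp_unhash - add/remove a socket in the protocol-wide
 * dccp_hashinfo lookup tables; exported so the address-family code can
 * install them as its struct proto ->hash/->unhash operations.
 */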
108 void dccp_hash(struct sock *sk)
109 {
110         inet_hash(&dccp_hashinfo, sk);
111 }
112
113 EXPORT_SYMBOL_GPL(dccp_hash);
114
115 void dccp_unhash(struct sock *sk)
116 {
117         inet_unhash(&dccp_hashinfo, sk);
118 }
119
120 EXPORT_SYMBOL_GPL(dccp_unhash);
121
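/*
 * dccp_init_sock - initialise the DCCP-specific part of a socket: default
 * options and epoch, feature negotiation, the optional ack vector, the RX
 * and TX CCID blocks (skipped for the control socket), the transmit timers
 * and the initial connection parameters.
 */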
122 int dccp_init_sock(struct sock *sk)
123 {
124         struct dccp_sock *dp = dccp_sk(sk);
125         struct inet_connection_sock *icsk = inet_csk(sk);
126         static int dccp_ctl_socket_init = 1;
127
128         dccp_options_init(&dp->dccps_options);
129         do_gettimeofday(&dp->dccps_epoch);
130
131         /*
132          * FIXME: We're hardcoding the CCID, and doing this at this point makes
133          * the listening (master) sock get CCID control blocks, which is not
134          * necessary, but for now, to not mess with the test userspace apps,
135          * let's leave it here; later the real solution is to do this in a
136          * setsockopt(CCIDs-I-want/accept). -acme
137          */
138         if (likely(!dccp_ctl_socket_init)) {
139                 int rc = dccp_feat_init(sk);
140
141                 if (rc)
142                         return rc;
143
144                 if (dp->dccps_options.dccpo_send_ack_vector) {
145                         dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
146                         if (dp->dccps_hc_rx_ackvec == NULL)
147                                 return -ENOMEM;
148                 }
149                 dp->dccps_hc_rx_ccid =
150                                 ccid_hc_rx_new(dp->dccps_options.dccpo_rx_ccid,
151                                                sk, GFP_KERNEL);
152                 dp->dccps_hc_tx_ccid =
153                                 ccid_hc_tx_new(dp->dccps_options.dccpo_tx_ccid,
154                                                sk, GFP_KERNEL);
155                 if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
156                              dp->dccps_hc_tx_ccid == NULL)) {
157                         ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
158                         ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
159                         if (dp->dccps_options.dccpo_send_ack_vector) {
160                                 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
161                                 dp->dccps_hc_rx_ackvec = NULL;
162                         }
163                         dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
164                         return -ENOMEM;
165                 }
166         } else {
167                 /* control socket doesn't need feat nego */
168                 INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
169                 INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
170                 dccp_ctl_socket_init = 0;
171         }
172
173         dccp_init_xmit_timers(sk);
174         icsk->icsk_rto          = DCCP_TIMEOUT_INIT;
175         sk->sk_state            = DCCP_CLOSED;
176         sk->sk_write_space      = dccp_write_space;
177         icsk->icsk_sync_mss     = dccp_sync_mss;
178         dp->dccps_mss_cache     = 536;
179         dp->dccps_role          = DCCP_ROLE_UNDEFINED;
180         dp->dccps_service       = DCCP_SERVICE_INVALID_VALUE;
181         dp->dccps_l_ack_ratio   = dp->dccps_r_ack_ratio = 1;
182
183         return 0;
184 }
185
186 EXPORT_SYMBOL_GPL(dccp_init_sock);
187
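/*
 * dccp_destroy_sock - undo dccp_init_sock and connection setup: free the
 * pending retransmit skb, release the bind bucket, the service list, the
 * ack vector, both CCID blocks and the feature negotiation state.
 */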
188 int dccp_destroy_sock(struct sock *sk)
189 {
190         struct dccp_sock *dp = dccp_sk(sk);
191
192         /*
193          * DCCP doesn't use sk_write_queue, just sk_send_head
194          * for retransmissions
195          */
196         if (sk->sk_send_head != NULL) {
197                 kfree_skb(sk->sk_send_head);
198                 sk->sk_send_head = NULL;
199         }
200
201         /* Clean up a referenced DCCP bind bucket. */
202         if (inet_csk(sk)->icsk_bind_hash != NULL)
203                 inet_put_port(&dccp_hashinfo, sk);
204
205         kfree(dp->dccps_service_list);
206         dp->dccps_service_list = NULL;
207
208         if (dp->dccps_options.dccpo_send_ack_vector) {
209                 dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
210                 dp->dccps_hc_rx_ackvec = NULL;
211         }
212         ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
213         ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
214         dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
215
216         /* clean up feature negotiation state */
217         dccp_feat_clean(sk);
218
219         return 0;
220 }
221
222 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
223
224 static inline int dccp_listen_start(struct sock *sk)
225 {
226         struct dccp_sock *dp = dccp_sk(sk);
227
228         dp->dccps_role = DCCP_ROLE_LISTEN;
229         /*
230          * Apps need to use setsockopt(DCCP_SOCKOPT_SERVICE)
231          * before calling listen()
232          */
233         if (dccp_service_not_initialized(sk))
234                 return -EPROTO;
235         return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
236 }
237
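/*
 * dccp_disconnect - abort the connection: move the socket to CLOSED, stop
 * the transmit timers, purge queued data and clear the addressing state so
 * the socket can be reused.
 */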
238 int dccp_disconnect(struct sock *sk, int flags)
239 {
240         struct inet_connection_sock *icsk = inet_csk(sk);
241         struct inet_sock *inet = inet_sk(sk);
242         int err = 0;
243         const int old_state = sk->sk_state;
244
245         if (old_state != DCCP_CLOSED)
246                 dccp_set_state(sk, DCCP_CLOSED);
247
248         /* ABORT function of RFC 793 */
249         if (old_state == DCCP_LISTEN) {
250                 inet_csk_listen_stop(sk);
251         /* FIXME: do the active reset thing */
252         } else if (old_state == DCCP_REQUESTING)
253                 sk->sk_err = ECONNRESET;
254
255         dccp_clear_xmit_timers(sk);
256         __skb_queue_purge(&sk->sk_receive_queue);
257         if (sk->sk_send_head != NULL) {
258                 __kfree_skb(sk->sk_send_head);
259                 sk->sk_send_head = NULL;
260         }
261
262         inet->dport = 0;
263
264         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
265                 inet_reset_saddr(sk);
266
267         sk->sk_shutdown = 0;
268         sock_reset_flag(sk, SOCK_DONE);
269
270         icsk->icsk_backoff = 0;
271         inet_csk_delack_init(sk);
272         __sk_dst_reset(sk);
273
274         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
275
276         sk->sk_error_report(sk);
277         return err;
278 }
279
280 EXPORT_SYMBOL_GPL(dccp_disconnect);
281
282 /*
283  *      Wait for a DCCP event.
284  *
285  *      Note that we don't need to lock the socket, as the upper poll layers
286  *      take care of normal races (between the test and the event) and we don't
287  *      go look at any of the socket buffers directly.
288  */
289 unsigned int dccp_poll(struct file *file, struct socket *sock,
290                        poll_table *wait)
291 {
292         unsigned int mask;
293         struct sock *sk = sock->sk;
294
295         poll_wait(file, sk->sk_sleep, wait);
296         if (sk->sk_state == DCCP_LISTEN)
297                 return inet_csk_listen_poll(sk);
298
299         /* Socket is not locked. We are protected from async events
300            by the poll logic, and correct handling of state changes
301            made by other threads is impossible in any case.
302          */
303
304         mask = 0;
305         if (sk->sk_err)
306                 mask = POLLERR;
307
308         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
309                 mask |= POLLHUP;
310         if (sk->sk_shutdown & RCV_SHUTDOWN)
311                 mask |= POLLIN | POLLRDNORM;
312
313         /* Connected? */
314         if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
315                 if (atomic_read(&sk->sk_rmem_alloc) > 0)
316                         mask |= POLLIN | POLLRDNORM;
317
318                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
319                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
320                                 mask |= POLLOUT | POLLWRNORM;
321                         } else {  /* send SIGIO later */
322                                 set_bit(SOCK_ASYNC_NOSPACE,
323                                         &sk->sk_socket->flags);
324                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
325
326                                 /* Race breaker. If space is freed after
327                                  * wspace test but before the flags are set,
328                                  * IO signal will be lost.
329                                  */
330                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
331                                         mask |= POLLOUT | POLLWRNORM;
332                         }
333                 }
334         }
335         return mask;
336 }
337
338 EXPORT_SYMBOL_GPL(dccp_poll);
339
340 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
341 {
342         dccp_pr_debug("entry\n");
343         return -ENOIOCTLCMD;
344 }
345
346 EXPORT_SYMBOL_GPL(dccp_ioctl);
347
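/*
 * dccp_setsockopt_service - install the socket's service code; the first
 * __be32 in optval is the primary service code, any further __be32 values
 * become the socket's service list.
 *
 * A minimal, purely illustrative user space call (42 is an arbitrary
 * service code) would be:
 *
 *	__be32 service = htonl(42);
 *	setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
 *		   &service, sizeof(service));
 */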
348 static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
349                                    char __user *optval, int optlen)
350 {
351         struct dccp_sock *dp = dccp_sk(sk);
352         struct dccp_service_list *sl = NULL;
353
354         if (service == DCCP_SERVICE_INVALID_VALUE || 
355             optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
356                 return -EINVAL;
357
358         if (optlen > sizeof(service)) {
359                 sl = kmalloc(optlen, GFP_KERNEL);
360                 if (sl == NULL)
361                         return -ENOMEM;
362
363                 sl->dccpsl_nr = optlen / sizeof(u32) - 1;
364                 if (copy_from_user(sl->dccpsl_list,
365                                    optval + sizeof(service),
366                                    optlen - sizeof(service)) ||
367                     dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
368                         kfree(sl);
369                         return -EFAULT;
370                 }
371         }
372
373         lock_sock(sk);
374         dp->dccps_service = service;
375
376         kfree(dp->dccps_service_list);
377
378         dp->dccps_service_list = sl;
379         release_sock(sk);
380         return 0;
381 }
382
383 /* Byte 1 is the feature; the rest is the preference list. */
384 static int dccp_setsockopt_change(struct sock *sk, int type,
385                                   struct dccp_so_feat __user *optval)
386 {
387         struct dccp_so_feat opt;
388         u8 *val;
389         int rc;
390
391         if (copy_from_user(&opt, optval, sizeof(opt)))
392                 return -EFAULT;
393
394         val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
395         if (!val)
396                 return -ENOMEM;
397
398         if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
399                 rc = -EFAULT;
400                 goto out_free_val;
401         }
402
403         rc = dccp_feat_change(sk, type, opt.dccpsf_feat, val, opt.dccpsf_len,
404                               GFP_KERNEL);
405         if (rc)
406                 goto out_free_val;
407
408 out:
409         return rc;
410
411 out_free_val:
412         kfree(val);
413         goto out;
414 }
415
416 int dccp_setsockopt(struct sock *sk, int level, int optname,
417                     char __user *optval, int optlen)
418 {
419         struct dccp_sock *dp;
420         int err;
421         int val;
422
423         if (level != SOL_DCCP)
424                 return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
425                                                              optname, optval,
426                                                              optlen);
427
428         if (optlen < sizeof(int))
429                 return -EINVAL;
430
431         if (get_user(val, (int __user *)optval))
432                 return -EFAULT;
433
434         if (optname == DCCP_SOCKOPT_SERVICE)
435                 return dccp_setsockopt_service(sk, val, optval, optlen);
436
437         lock_sock(sk);
438         dp = dccp_sk(sk);
439         err = 0;
440
441         switch (optname) {
442         case DCCP_SOCKOPT_PACKET_SIZE:
443                 dp->dccps_packet_size = val;
444                 break;
445
446         case DCCP_SOCKOPT_CHANGE_L:
447                 if (optlen != sizeof(struct dccp_so_feat))
448                         err = -EINVAL;
449                 else
450                         err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
451                                                      (struct dccp_so_feat *)
452                                                      optval);
453                 break;
454
455         case DCCP_SOCKOPT_CHANGE_R:
456                 if (optlen != sizeof(struct dccp_so_feat))
457                         err = -EINVAL;
458                 else
459                         err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
460                                                      (struct dccp_so_feat *)
461                                                      optval);
462                 break;
463
464         default:
465                 err = -ENOPROTOOPT;
466                 break;
467         }
468         
469         release_sock(sk);
470         return err;
471 }
472
473 EXPORT_SYMBOL_GPL(dccp_setsockopt);
474
475 static int dccp_getsockopt_service(struct sock *sk, int len,
476                                    __be32 __user *optval,
477                                    int __user *optlen)
478 {
479         const struct dccp_sock *dp = dccp_sk(sk);
480         const struct dccp_service_list *sl;
481         int err = -ENOENT, slen = 0, total_len = sizeof(u32);
482
483         lock_sock(sk);
484         if (dccp_service_not_initialized(sk))
485                 goto out;
486
487         if ((sl = dp->dccps_service_list) != NULL) {
488                 slen = sl->dccpsl_nr * sizeof(u32);
489                 total_len += slen;
490         }
491
492         err = -EINVAL;
493         if (total_len > len)
494                 goto out;
495
496         err = 0;
497         if (put_user(total_len, optlen) ||
498             put_user(dp->dccps_service, optval) ||
499             (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
500                 err = -EFAULT;
501 out:
502         release_sock(sk);
503         return err;
504 }
505
506 int dccp_getsockopt(struct sock *sk, int level, int optname,
507                     char __user *optval, int __user *optlen)
508 {
509         struct dccp_sock *dp;
510         int val, len;
511
512         if (level != SOL_DCCP)
513                 return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
514                                                              optname, optval,
515                                                              optlen);
516         if (get_user(len, optlen))
517                 return -EFAULT;
518
519         if (len < sizeof(int))
520                 return -EINVAL;
521
522         dp = dccp_sk(sk);
523
524         switch (optname) {
525         case DCCP_SOCKOPT_PACKET_SIZE:
526                 val = dp->dccps_packet_size;
527                 len = sizeof(dp->dccps_packet_size);
528                 break;
529         case DCCP_SOCKOPT_SERVICE:
530                 return dccp_getsockopt_service(sk, len,
531                                                (__be32 __user *)optval, optlen);
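        /* Option names 128..191 are passed to the RX CCID, 192..255 to
         * the TX CCID. */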
532         case 128 ... 191:
533                 return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
534                                              len, (u32 __user *)optval, optlen);
535         case 192 ... 255:
536                 return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
537                                              len, (u32 __user *)optval, optlen);
538         default:
539                 return -ENOPROTOOPT;
540         }
541
542         if (put_user(len, optlen) || copy_to_user(optval, &val, len))
543                 return -EFAULT;
544
545         return 0;
546 }
547
548 EXPORT_SYMBOL_GPL(dccp_getsockopt);
549
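/*
 * dccp_sendmsg - queue a single datagram for transmission.  DCCP keeps
 * packet boundaries, so a message larger than the current MSS is rejected
 * with -EMSGSIZE rather than segmented, and the skb is handed directly to
 * dccp_write_xmit instead of going through sk_write_queue.
 */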
550 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
551                  size_t len)
552 {
553         const struct dccp_sock *dp = dccp_sk(sk);
554         const int flags = msg->msg_flags;
555         const int noblock = flags & MSG_DONTWAIT;
556         struct sk_buff *skb;
557         int rc, size;
558         long timeo;
559
560         if (len > dp->dccps_mss_cache)
561                 return -EMSGSIZE;
562
563         lock_sock(sk);
564         timeo = sock_sndtimeo(sk, noblock);
565
566         /*
567          * We have to use sk_stream_wait_connect here to set sk_write_pending,
568          * so that the trick in dccp_rcv_request_sent_state_process works.
569          */
570         /* Wait for a connection to finish. */
571         if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
572                 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
573                         goto out_release;
574
575         size = sk->sk_prot->max_header + len;
576         release_sock(sk);
577         skb = sock_alloc_send_skb(sk, size, noblock, &rc);
578         lock_sock(sk);
579         if (skb == NULL)
580                 goto out_release;
581
582         skb_reserve(skb, sk->sk_prot->max_header);
583         rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
584         if (rc != 0)
585                 goto out_discard;
586
587         rc = dccp_write_xmit(sk, skb, &timeo);
588         /*
589          * XXX we don't use sk_write_queue, so just discard the packet.
590          *     The current plan, however, is to _use_ sk_write_queue with
591          *     an algorithm similar to tcp_sendmsg, where the main difference
592          *     is that in DCCP we have to respect packet boundaries, so
593          *     no coalescing of skbs.
594          *
595          *     This bug was _quickly_ found & fixed by just looking at an OSTRA
596          *     generated callgraph 8) -acme
597          */
598 out_release:
599         release_sock(sk);
600         return rc ? : len;
601 out_discard:
602         kfree_skb(skb);
603         goto out_release;
604 }
605
606 EXPORT_SYMBOL_GPL(dccp_sendmsg);
607
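/*
 * dccp_recvmsg - receive a single datagram.  Only DATA/DATAACK packets
 * carry payload; RESET and CLOSE end the read with a return of 0, other
 * packet types are silently discarded, and a packet larger than the user
 * buffer is truncated and flagged with MSG_TRUNC.
 */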
608 int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
609                  size_t len, int nonblock, int flags, int *addr_len)
610 {
611         const struct dccp_hdr *dh;
612         long timeo;
613
614         lock_sock(sk);
615
616         if (sk->sk_state == DCCP_LISTEN) {
617                 len = -ENOTCONN;
618                 goto out;
619         }
620
621         timeo = sock_rcvtimeo(sk, nonblock);
622
623         do {
624                 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
625
626                 if (skb == NULL)
627                         goto verify_sock_status;
628
629                 dh = dccp_hdr(skb);
630
631                 if (dh->dccph_type == DCCP_PKT_DATA ||
632                     dh->dccph_type == DCCP_PKT_DATAACK)
633                         goto found_ok_skb;
634
635                 if (dh->dccph_type == DCCP_PKT_RESET ||
636                     dh->dccph_type == DCCP_PKT_CLOSE) {
637                         dccp_pr_debug("found fin ok!\n");
638                         len = 0;
639                         goto found_fin_ok;
640                 }
641                 dccp_pr_debug("packet_type=%s\n",
642                               dccp_packet_name(dh->dccph_type));
643                 sk_eat_skb(sk, skb);
644 verify_sock_status:
645                 if (sock_flag(sk, SOCK_DONE)) {
646                         len = 0;
647                         break;
648                 }
649
650                 if (sk->sk_err) {
651                         len = sock_error(sk);
652                         break;
653                 }
654
655                 if (sk->sk_shutdown & RCV_SHUTDOWN) {
656                         len = 0;
657                         break;
658                 }
659
660                 if (sk->sk_state == DCCP_CLOSED) {
661                         if (!sock_flag(sk, SOCK_DONE)) {
662                                 /* This occurs when the user tries to read
663                                  * from a never-connected socket.
664                                  */
665                                 len = -ENOTCONN;
666                                 break;
667                         }
668                         len = 0;
669                         break;
670                 }
671
672                 if (!timeo) {
673                         len = -EAGAIN;
674                         break;
675                 }
676
677                 if (signal_pending(current)) {
678                         len = sock_intr_errno(timeo);
679                         break;
680                 }
681
682                 sk_wait_data(sk, &timeo);
683                 continue;
684         found_ok_skb:
685                 if (len > skb->len)
686                         len = skb->len;
687                 else if (len < skb->len)
688                         msg->msg_flags |= MSG_TRUNC;
689
690                 if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
691                         /* Exception. Bailout! */
692                         len = -EFAULT;
693                         break;
694                 }
695         found_fin_ok:
696                 if (!(flags & MSG_PEEK))
697                         sk_eat_skb(sk, skb);
698                 break;
699         } while (1);
700 out:
701         release_sock(sk);
702         return len;
703 }
704
705 EXPORT_SYMBOL_GPL(dccp_recvmsg);
706
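/*
 * inet_dccp_listen - listen() entry point for DCCP sockets: start the
 * listening state if the socket is CLOSED, otherwise (already listening)
 * just update the accept backlog.
 */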
707 int inet_dccp_listen(struct socket *sock, int backlog)
708 {
709         struct sock *sk = sock->sk;
710         unsigned char old_state;
711         int err;
712
713         lock_sock(sk);
714
715         err = -EINVAL;
716         if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
717                 goto out;
718
719         old_state = sk->sk_state;
720         if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
721                 goto out;
722
723         /* Really, if the socket is already in listen state
724          * we can only allow the backlog to be adjusted.
725          */
726         if (old_state != DCCP_LISTEN) {
727                 /*
728                  * FIXME: here it probably should be sk->sk_prot->listen_start,
729                  * see tcp_listen_start
730                  */
731                 err = dccp_listen_start(sk);
732                 if (err)
733                         goto out;
734         }
735         sk->sk_max_ack_backlog = backlog;
736         err = 0;
737
738 out:
739         release_sock(sk);
740         return err;
741 }
742
743 EXPORT_SYMBOL_GPL(inet_dccp_listen);
744
745 static const unsigned char dccp_new_state[] = {
746         /* current state:   new state:      action:     */
747         [0]               = DCCP_CLOSED,
748         [DCCP_OPEN]       = DCCP_CLOSING | DCCP_ACTION_FIN,
749         [DCCP_REQUESTING] = DCCP_CLOSED,
750         [DCCP_PARTOPEN]   = DCCP_CLOSING | DCCP_ACTION_FIN,
751         [DCCP_LISTEN]     = DCCP_CLOSED,
752         [DCCP_RESPOND]    = DCCP_CLOSED,
753         [DCCP_CLOSING]    = DCCP_CLOSED,
754         [DCCP_TIME_WAIT]  = DCCP_CLOSED,
755         [DCCP_CLOSED]     = DCCP_CLOSED,
756 };
757
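/*
 * dccp_close_state - move to the state close() implies from the current
 * one (see dccp_new_state above) and report via DCCP_ACTION_FIN whether a
 * CLOSE packet must be sent.
 */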
758 static int dccp_close_state(struct sock *sk)
759 {
760         const int next = dccp_new_state[sk->sk_state];
761         const int ns = next & DCCP_STATE_MASK;
762
763         if (ns != sk->sk_state)
764                 dccp_set_state(sk, ns);
765
766         return next & DCCP_ACTION_FIN;
767 }
768
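/*
 * dccp_close - active close from user space: discard unread data, send a
 * CLOSE if the state machine asks for one, then orphan the socket and
 * either destroy it immediately (if already CLOSED) or leave the
 * retransmit timer and incoming packets to finish the termination.
 */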
769 void dccp_close(struct sock *sk, long timeout)
770 {
771         struct sk_buff *skb;
772
773         lock_sock(sk);
774
775         sk->sk_shutdown = SHUTDOWN_MASK;
776
777         if (sk->sk_state == DCCP_LISTEN) {
778                 dccp_set_state(sk, DCCP_CLOSED);
779
780                 /* Special case. */
781                 inet_csk_listen_stop(sk);
782
783                 goto adjudge_to_death;
784         }
785
786         /*
787          * We need to flush the recv. buffs.  We do this only on the
788          * descriptor close, not protocol-sourced closes, because the
789          * reader process may not have drained the data yet!
790          */
791         /* FIXME: check for unread data */
792         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
793                 __kfree_skb(skb);
794         }
795
796         if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
797                 /* Check zero linger _after_ checking for unread data. */
798                 sk->sk_prot->disconnect(sk, 0);
799         } else if (dccp_close_state(sk)) {
800                 dccp_send_close(sk, 1);
801         }
802
803         sk_stream_wait_close(sk, timeout);
804
805 adjudge_to_death:
806         /*
807          * It is the last release_sock in its life. It will remove backlog.
808          */
809         release_sock(sk);
810         /*
811          * Now socket is owned by kernel and we acquire BH lock
812          * to finish close. No need to check for user refs.
813          */
814         local_bh_disable();
815         bh_lock_sock(sk);
816         BUG_TRAP(!sock_owned_by_user(sk));
817
818         sock_hold(sk);
819         sock_orphan(sk);
820
821         /*
822          * The last release_sock may have processed the CLOSE or RESET
823          * packet, moving the sock to the CLOSED state; if not, we have to fire
824          * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
825          * in draft-ietf-dccp-spec-11. -acme
826          */
827         if (sk->sk_state == DCCP_CLOSING) {
828                 /* FIXME: should start at 2 * RTT */
829                 /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
830                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
831                                           inet_csk(sk)->icsk_rto,
832                                           DCCP_RTO_MAX);
833 #if 0
834                 /* Yeah, we should use sk->sk_prot->orphan_count, etc */
835                 dccp_set_state(sk, DCCP_CLOSED);
836 #endif
837         }
838
839         atomic_inc(sk->sk_prot->orphan_count);
840         if (sk->sk_state == DCCP_CLOSED)
841                 inet_csk_destroy_sock(sk);
842
843         /* Otherwise, socket is reprieved until protocol close. */
844
845         bh_unlock_sock(sk);
846         local_bh_enable();
847         sock_put(sk);
848 }
849
850 EXPORT_SYMBOL_GPL(dccp_close);
851
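/*
 * dccp_shutdown - not implemented yet; shutdown() on a DCCP socket is
 * currently a no-op apart from the debug message.
 */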
852 void dccp_shutdown(struct sock *sk, int how)
853 {
854         dccp_pr_debug("entry\n");
855 }
856
857 EXPORT_SYMBOL_GPL(dccp_shutdown);
858
859 static const struct proto_ops inet_dccp_ops = {
860         .family         = PF_INET,
861         .owner          = THIS_MODULE,
862         .release        = inet_release,
863         .bind           = inet_bind,
864         .connect        = inet_stream_connect,
865         .socketpair     = sock_no_socketpair,
866         .accept         = inet_accept,
867         .getname        = inet_getname,
868         /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
869         .poll           = dccp_poll,
870         .ioctl          = inet_ioctl,
871         /* FIXME: work on inet_listen to rename it to sock_common_listen */
872         .listen         = inet_dccp_listen,
873         .shutdown       = inet_shutdown,
874         .setsockopt     = sock_common_setsockopt,
875         .getsockopt     = sock_common_getsockopt,
876         .sendmsg        = inet_sendmsg,
877         .recvmsg        = sock_common_recvmsg,
878         .mmap           = sock_no_mmap,
879         .sendpage       = sock_no_sendpage,
880 };
881
882 extern struct net_proto_family inet_family_ops;
883
884 static struct inet_protosw dccp_v4_protosw = {
885         .type           = SOCK_DCCP,
886         .protocol       = IPPROTO_DCCP,
887         .prot           = &dccp_prot,
888         .ops            = &inet_dccp_ops,
889         .capability     = -1,
890         .no_check       = 0,
891         .flags          = INET_PROTOSW_ICSK,
892 };
893
894 /*
895  * This is the global socket data structure used for responding to
896  * Out-of-the-blue (OOTB) packets. A control sock will be created
897  * for this socket at initialization time.
898  */
899 struct socket *dccp_ctl_socket;
900
901 static char dccp_ctl_socket_err_msg[] __initdata =
902         KERN_ERR "DCCP: Failed to create the control socket.\n";
903
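/*
 * dccp_ctl_sock_init - create the kernel-internal control socket used for
 * responding to OOTB packets; it is unhashed so that it never matches
 * incoming packet lookups.
 */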
904 static int __init dccp_ctl_sock_init(void)
905 {
906         int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
907                                   &dccp_ctl_socket);
908         if (rc < 0)
909                 printk(dccp_ctl_socket_err_msg);
910         else {
911                 dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
912                 inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
913
914                 /* Unhash it so that IP input processing does not even
915                  * see it; we do not want this socket to see incoming
916                  * packets.
917                  */
918                 dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
919         }
920
921         return rc;
922 }
923
924 #ifdef CONFIG_IP_DCCP_UNLOAD_HACK
925 void dccp_ctl_sock_exit(void)
926 {
927         if (dccp_ctl_socket != NULL) {
928                 sock_release(dccp_ctl_socket);
929                 dccp_ctl_socket = NULL;
930         }
931 }
932
933 EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
934 #endif
935
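/*
 * init_dccp_v4_mibs - allocate the two per-cpu halves of the DCCP SNMP
 * statistics; if the second allocation fails, the first is freed again.
 */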
936 static int __init init_dccp_v4_mibs(void)
937 {
938         int rc = -ENOMEM;
939
940         dccp_statistics[0] = alloc_percpu(struct dccp_mib);
941         if (dccp_statistics[0] == NULL)
942                 goto out;
943
944         dccp_statistics[1] = alloc_percpu(struct dccp_mib);
945         if (dccp_statistics[1] == NULL)
946                 goto out_free_one;
947
948         rc = 0;
949 out:
950         return rc;
951 out_free_one:
952         free_percpu(dccp_statistics[0]);
953         dccp_statistics[0] = NULL;
954         goto out;
955
956 }
957
958 static int thash_entries;
959 module_param(thash_entries, int, 0444);
960 MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
961
962 #ifdef CONFIG_IP_DCCP_DEBUG
963 int dccp_debug;
964 module_param(dccp_debug, int, 0444);
965 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
966
967 EXPORT_SYMBOL_GPL(dccp_debug);
968 #endif
969
970 static int __init dccp_init(void)
971 {
972         unsigned long goal;
973         int ehash_order, bhash_order, i;
974         int rc = proto_register(&dccp_prot, 1);
975
976         if (rc)
977                 goto out;
978
979         rc = -ENOBUFS;
980         dccp_hashinfo.bind_bucket_cachep =
981                 kmem_cache_create("dccp_bind_bucket",
982                                   sizeof(struct inet_bind_bucket), 0,
983                                   SLAB_HWCACHE_ALIGN, NULL, NULL);
984         if (!dccp_hashinfo.bind_bucket_cachep)
985                 goto out_proto_unregister;
986
987         /*
988          * Size and allocate the main established and bind bucket
989          * hash tables.
990          *
991          * The methodology is similar to that of the buffer cache.
992          */
993         if (num_physpages >= (128 * 1024))
994                 goal = num_physpages >> (21 - PAGE_SHIFT);
995         else
996                 goal = num_physpages >> (23 - PAGE_SHIFT);
997
998         if (thash_entries)
999                 goal = (thash_entries *
1000                         sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
1001         for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
1002                 ;
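        /*
         * Round the table size down to a power of two and retry with
         * smaller allocation orders until __get_free_pages succeeds;
         * ehash_size counts half the buckets, and the loop below
         * initialises ehash_size << 1 of them.
         */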
1003         do {
1004                 dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
1005                                         sizeof(struct inet_ehash_bucket);
1006                 dccp_hashinfo.ehash_size >>= 1;
1007                 while (dccp_hashinfo.ehash_size &
1008                        (dccp_hashinfo.ehash_size - 1))
1009                         dccp_hashinfo.ehash_size--;
1010                 dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
1011                         __get_free_pages(GFP_ATOMIC, ehash_order);
1012         } while (!dccp_hashinfo.ehash && --ehash_order > 0);
1013
1014         if (!dccp_hashinfo.ehash) {
1015                 printk(KERN_CRIT "Failed to allocate DCCP "
1016                                  "established hash table\n");
1017                 goto out_free_bind_bucket_cachep;
1018         }
1019
1020         for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
1021                 rwlock_init(&dccp_hashinfo.ehash[i].lock);
1022                 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
1023         }
1024
1025         bhash_order = ehash_order;
1026
1027         do {
1028                 dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
1029                                         sizeof(struct inet_bind_hashbucket);
1030                 if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
1031                     bhash_order > 0)
1032                         continue;
1033                 dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
1034                         __get_free_pages(GFP_ATOMIC, bhash_order);
1035         } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
1036
1037         if (!dccp_hashinfo.bhash) {
1038                 printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
1039                 goto out_free_dccp_ehash;
1040         }
1041
1042         for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
1043                 spin_lock_init(&dccp_hashinfo.bhash[i].lock);
1044                 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
1045         }
1046
1047         rc = init_dccp_v4_mibs();
1048         if (rc)
1049                 goto out_free_dccp_bhash;
1050
1051         rc = -EAGAIN;
1052         if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
1053                 goto out_free_dccp_v4_mibs;
1054
1055         inet_register_protosw(&dccp_v4_protosw);
1056
1057         rc = dccp_ackvec_init();
1058         if (rc)
1059                 goto out_unregister_protosw;
1060
1061         rc = dccp_sysctl_init();
1062         if (rc)
1063                 goto out_ackvec_exit;
1064
1065         rc = dccp_ctl_sock_init();
1066         if (rc)
1067                 goto out_sysctl_exit;
1068 out:
1069         return rc;
1070 out_sysctl_exit:
1071         dccp_sysctl_exit();
1072 out_ackvec_exit:
1073         dccp_ackvec_exit();
1074 out_unregister_protosw:
1075         inet_unregister_protosw(&dccp_v4_protosw);
1076         inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
1077 out_free_dccp_v4_mibs:
1078         free_percpu(dccp_statistics[0]);
1079         free_percpu(dccp_statistics[1]);
1080         dccp_statistics[0] = dccp_statistics[1] = NULL;
1081 out_free_dccp_bhash:
1082         free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
1083         dccp_hashinfo.bhash = NULL;
1084 out_free_dccp_ehash:
1085         free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
1086         dccp_hashinfo.ehash = NULL;
1087 out_free_bind_bucket_cachep:
1088         kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
1089         dccp_hashinfo.bind_bucket_cachep = NULL;
1090 out_proto_unregister:
1091         proto_unregister(&dccp_prot);
1092         goto out;
1093 }
1094
1095 static const char dccp_del_proto_err_msg[] __exitdata =
1096         KERN_ERR "can't remove dccp net_protocol\n";
1097
1098 static void __exit dccp_fini(void)
1099 {
1100         inet_unregister_protosw(&dccp_v4_protosw);
1101
1102         if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
1103                 printk(dccp_del_proto_err_msg);
1104
1105         free_percpu(dccp_statistics[0]);
1106         free_percpu(dccp_statistics[1]);
1107         free_pages((unsigned long)dccp_hashinfo.bhash,
1108                    get_order(dccp_hashinfo.bhash_size *
1109                              sizeof(struct inet_bind_hashbucket)));
1110         free_pages((unsigned long)dccp_hashinfo.ehash,
1111                    get_order(dccp_hashinfo.ehash_size *
1112                              sizeof(struct inet_ehash_bucket)));
1113         kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
1114         proto_unregister(&dccp_prot);
1115         dccp_ackvec_exit();
1116         dccp_sysctl_exit();
1117 }
1118
1119 module_init(dccp_init);
1120 module_exit(dccp_fini);
1121
1122 /*
1123  * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP (33)
1124  * values directly. Also cover the case where the protocol is not specified,
1125  * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
1126  */
1127 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
1128 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
1129 MODULE_LICENSE("GPL");
1130 MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
1131 MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");