4 * An implementation of the DCCP protocol
5 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
12 #include <linux/config.h>
13 #include <linux/dccp.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/if_arp.h>
22 #include <linux/init.h>
23 #include <linux/random.h>
24 #include <net/checksum.h>
26 #include <net/inet_common.h>
28 #include <net/protocol.h>
32 #include <asm/semaphore.h>
33 #include <linux/spinlock.h>
34 #include <linux/timer.h>
35 #include <linux/delay.h>
36 #include <linux/poll.h>
37 #include <linux/dccp.h>
42 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
44 atomic_t dccp_orphan_count = ATOMIC_INIT(0);
46 static struct net_protocol dccp_protocol = {
47 .handler = dccp_v4_rcv,
48 .err_handler = dccp_v4_err,
51 const char *dccp_packet_name(const int type)
53 static const char *dccp_packet_names[] = {
54 [DCCP_PKT_REQUEST] = "REQUEST",
55 [DCCP_PKT_RESPONSE] = "RESPONSE",
56 [DCCP_PKT_DATA] = "DATA",
57 [DCCP_PKT_ACK] = "ACK",
58 [DCCP_PKT_DATAACK] = "DATAACK",
59 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
60 [DCCP_PKT_CLOSE] = "CLOSE",
61 [DCCP_PKT_RESET] = "RESET",
62 [DCCP_PKT_SYNC] = "SYNC",
63 [DCCP_PKT_SYNCACK] = "SYNCACK",
66 if (type >= DCCP_NR_PKT_TYPES)
69 return dccp_packet_names[type];
72 EXPORT_SYMBOL_GPL(dccp_packet_name);
74 const char *dccp_state_name(const int state)
76 static char *dccp_state_names[] = {
78 [DCCP_REQUESTING] = "REQUESTING",
79 [DCCP_PARTOPEN] = "PARTOPEN",
80 [DCCP_LISTEN] = "LISTEN",
81 [DCCP_RESPOND] = "RESPOND",
82 [DCCP_CLOSING] = "CLOSING",
83 [DCCP_TIME_WAIT] = "TIME_WAIT",
84 [DCCP_CLOSED] = "CLOSED",
87 if (state >= DCCP_MAX_STATES)
88 return "INVALID STATE!";
90 return dccp_state_names[state];
93 EXPORT_SYMBOL_GPL(dccp_state_name);
95 static inline int dccp_listen_start(struct sock *sk)
97 dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
98 return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
101 int dccp_disconnect(struct sock *sk, int flags)
103 struct inet_connection_sock *icsk = inet_csk(sk);
104 struct inet_sock *inet = inet_sk(sk);
106 const int old_state = sk->sk_state;
108 if (old_state != DCCP_CLOSED)
109 dccp_set_state(sk, DCCP_CLOSED);
111 /* ABORT function of RFC793 */
112 if (old_state == DCCP_LISTEN) {
113 inet_csk_listen_stop(sk);
114 /* FIXME: do the active reset thing */
115 } else if (old_state == DCCP_REQUESTING)
116 sk->sk_err = ECONNRESET;
118 dccp_clear_xmit_timers(sk);
119 __skb_queue_purge(&sk->sk_receive_queue);
120 if (sk->sk_send_head != NULL) {
121 __kfree_skb(sk->sk_send_head);
122 sk->sk_send_head = NULL;
127 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
128 inet_reset_saddr(sk);
131 sock_reset_flag(sk, SOCK_DONE);
133 icsk->icsk_backoff = 0;
134 inet_csk_delack_init(sk);
137 BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
139 sk->sk_error_report(sk);
144 * Wait for a DCCP event.
146 * Note that we don't need to lock the socket, as the upper poll layers
147 * take care of normal races (between the test and the event) and we don't
148 * go look at any of the socket buffers directly.
150 static unsigned int dccp_poll(struct file *file, struct socket *sock,
154 struct sock *sk = sock->sk;
156 poll_wait(file, sk->sk_sleep, wait);
157 if (sk->sk_state == DCCP_LISTEN)
158 return inet_csk_listen_poll(sk);
160 /* Socket is not locked. We are protected from async events
161 by poll logic and correct handling of state changes
162 made by another threads is impossible in any case.
169 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
171 if (sk->sk_shutdown & RCV_SHUTDOWN)
172 mask |= POLLIN | POLLRDNORM;
175 if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
176 if (atomic_read(&sk->sk_rmem_alloc) > 0)
177 mask |= POLLIN | POLLRDNORM;
179 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
180 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
181 mask |= POLLOUT | POLLWRNORM;
182 } else { /* send SIGIO later */
183 set_bit(SOCK_ASYNC_NOSPACE,
184 &sk->sk_socket->flags);
185 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
187 /* Race breaker. If space is freed after
188 * wspace test but before the flags are set,
189 * IO signal will be lost.
191 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
192 mask |= POLLOUT | POLLWRNORM;
199 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
201 dccp_pr_debug("entry\n");
205 int dccp_setsockopt(struct sock *sk, int level, int optname,
206 char __user *optval, int optlen)
208 struct dccp_sock *dp;
212 if (level != SOL_DCCP)
213 return ip_setsockopt(sk, level, optname, optval, optlen);
215 if (optlen < sizeof(int))
218 if (get_user(val, (int __user *)optval))
227 case DCCP_SOCKOPT_PACKET_SIZE:
228 dp->dccps_packet_size = val;
239 int dccp_getsockopt(struct sock *sk, int level, int optname,
240 char __user *optval, int __user *optlen)
242 struct dccp_sock *dp;
245 if (level != SOL_DCCP)
246 return ip_getsockopt(sk, level, optname, optval, optlen);
248 if (get_user(len, optlen))
251 len = min_t(unsigned int, len, sizeof(int));
258 case DCCP_SOCKOPT_PACKET_SIZE:
259 val = dp->dccps_packet_size;
265 if (put_user(len, optlen) || copy_to_user(optval, &val, len))
271 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
274 const struct dccp_sock *dp = dccp_sk(sk);
275 const int flags = msg->msg_flags;
276 const int noblock = flags & MSG_DONTWAIT;
281 if (len > dp->dccps_mss_cache)
285 timeo = sock_sndtimeo(sk, noblock);
288 * We have to use sk_stream_wait_connect here to set sk_write_pending,
289 * so that the trick in dccp_rcv_request_sent_state_process.
291 /* Wait for a connection to finish. */
292 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
293 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
296 size = sk->sk_prot->max_header + len;
298 skb = sock_alloc_send_skb(sk, size, noblock, &rc);
303 skb_reserve(skb, sk->sk_prot->max_header);
304 rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
308 rc = dccp_write_xmit(sk, skb, &timeo);
310 * XXX we don't use sk_write_queue, so just discard the packet.
311 * Current plan however is to _use_ sk_write_queue with
312 * an algorith similar to tcp_sendmsg, where the main difference
313 * is that in DCCP we have to respect packet boundaries, so
314 * no coalescing of skbs.
316 * This bug was _quickly_ found & fixed by just looking at an OSTRA
317 * generated callgraph 8) -acme
329 int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
330 size_t len, int nonblock, int flags, int *addr_len)
332 const struct dccp_hdr *dh;
337 if (sk->sk_state == DCCP_LISTEN) {
342 timeo = sock_rcvtimeo(sk, nonblock);
345 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
348 goto verify_sock_status;
352 if (dh->dccph_type == DCCP_PKT_DATA ||
353 dh->dccph_type == DCCP_PKT_DATAACK)
356 if (dh->dccph_type == DCCP_PKT_RESET ||
357 dh->dccph_type == DCCP_PKT_CLOSE) {
358 dccp_pr_debug("found fin ok!\n");
362 dccp_pr_debug("packet_type=%s\n",
363 dccp_packet_name(dh->dccph_type));
366 if (sock_flag(sk, SOCK_DONE)) {
372 len = sock_error(sk);
376 if (sk->sk_shutdown & RCV_SHUTDOWN) {
381 if (sk->sk_state == DCCP_CLOSED) {
382 if (!sock_flag(sk, SOCK_DONE)) {
383 /* This occurs when user tries to read
384 * from never connected socket.
398 if (signal_pending(current)) {
399 len = sock_intr_errno(timeo);
403 sk_wait_data(sk, &timeo);
408 else if (len < skb->len)
409 msg->msg_flags |= MSG_TRUNC;
411 if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
412 /* Exception. Bailout! */
417 if (!(flags & MSG_PEEK))
426 static int inet_dccp_listen(struct socket *sock, int backlog)
428 struct sock *sk = sock->sk;
429 unsigned char old_state;
435 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
438 old_state = sk->sk_state;
439 if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
442 /* Really, if the socket is already in listen state
443 * we can only allow the backlog to be adjusted.
445 if (old_state != DCCP_LISTEN) {
447 * FIXME: here it probably should be sk->sk_prot->listen_start
448 * see tcp_listen_start
450 err = dccp_listen_start(sk);
454 sk->sk_max_ack_backlog = backlog;
462 static const unsigned char dccp_new_state[] = {
463 /* current state: new state: action: */
465 [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
466 [DCCP_REQUESTING] = DCCP_CLOSED,
467 [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
468 [DCCP_LISTEN] = DCCP_CLOSED,
469 [DCCP_RESPOND] = DCCP_CLOSED,
470 [DCCP_CLOSING] = DCCP_CLOSED,
471 [DCCP_TIME_WAIT] = DCCP_CLOSED,
472 [DCCP_CLOSED] = DCCP_CLOSED,
475 static int dccp_close_state(struct sock *sk)
477 const int next = dccp_new_state[sk->sk_state];
478 const int ns = next & DCCP_STATE_MASK;
480 if (ns != sk->sk_state)
481 dccp_set_state(sk, ns);
483 return next & DCCP_ACTION_FIN;
486 void dccp_close(struct sock *sk, long timeout)
492 sk->sk_shutdown = SHUTDOWN_MASK;
494 if (sk->sk_state == DCCP_LISTEN) {
495 dccp_set_state(sk, DCCP_CLOSED);
498 inet_csk_listen_stop(sk);
500 goto adjudge_to_death;
504 * We need to flush the recv. buffs. We do this only on the
505 * descriptor close, not protocol-sourced closes, because the
506 *reader process may not have drained the data yet!
508 /* FIXME: check for unread data */
509 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
513 if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
514 /* Check zero linger _after_ checking for unread data. */
515 sk->sk_prot->disconnect(sk, 0);
516 } else if (dccp_close_state(sk)) {
517 dccp_send_close(sk, 1);
520 sk_stream_wait_close(sk, timeout);
524 * It is the last release_sock in its life. It will remove backlog.
528 * Now socket is owned by kernel and we acquire BH lock
529 * to finish close. No need to check for user refs.
533 BUG_TRAP(!sock_owned_by_user(sk));
539 * The last release_sock may have processed the CLOSE or RESET
540 * packet moving sock to CLOSED state, if not we have to fire
541 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
542 * in draft-ietf-dccp-spec-11. -acme
544 if (sk->sk_state == DCCP_CLOSING) {
545 /* FIXME: should start at 2 * RTT */
546 /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
547 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
548 inet_csk(sk)->icsk_rto,
551 /* Yeah, we should use sk->sk_prot->orphan_count, etc */
552 dccp_set_state(sk, DCCP_CLOSED);
556 atomic_inc(sk->sk_prot->orphan_count);
557 if (sk->sk_state == DCCP_CLOSED)
558 inet_csk_destroy_sock(sk);
560 /* Otherwise, socket is reprieved until protocol close. */
567 void dccp_shutdown(struct sock *sk, int how)
569 dccp_pr_debug("entry\n");
572 static struct proto_ops inet_dccp_ops = {
574 .owner = THIS_MODULE,
575 .release = inet_release,
577 .connect = inet_stream_connect,
578 .socketpair = sock_no_socketpair,
579 .accept = inet_accept,
580 .getname = inet_getname,
581 /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
584 /* FIXME: work on inet_listen to rename it to sock_common_listen */
585 .listen = inet_dccp_listen,
586 .shutdown = inet_shutdown,
587 .setsockopt = sock_common_setsockopt,
588 .getsockopt = sock_common_getsockopt,
589 .sendmsg = inet_sendmsg,
590 .recvmsg = sock_common_recvmsg,
591 .mmap = sock_no_mmap,
592 .sendpage = sock_no_sendpage,
595 extern struct net_proto_family inet_family_ops;
597 static struct inet_protosw dccp_v4_protosw = {
599 .protocol = IPPROTO_DCCP,
600 .prot = &dccp_v4_prot,
601 .ops = &inet_dccp_ops,
608 * This is the global socket data structure used for responding to
609 * the Out-of-the-blue (OOTB) packets. A control sock will be created
610 * for this socket at the initialization time.
612 struct socket *dccp_ctl_socket;
614 static char dccp_ctl_socket_err_msg[] __initdata =
615 KERN_ERR "DCCP: Failed to create the control socket.\n";
617 static int __init dccp_ctl_sock_init(void)
619 int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
622 printk(dccp_ctl_socket_err_msg);
624 dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
625 inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
627 /* Unhash it so that IP input processing does not even
628 * see it, we do not wish this socket to see incoming
631 dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
637 #ifdef CONFIG_IP_DCCP_UNLOAD_HACK
638 void dccp_ctl_sock_exit(void)
640 if (dccp_ctl_socket != NULL) {
641 sock_release(dccp_ctl_socket);
642 dccp_ctl_socket = NULL;
646 EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
649 static int __init init_dccp_v4_mibs(void)
653 dccp_statistics[0] = alloc_percpu(struct dccp_mib);
654 if (dccp_statistics[0] == NULL)
657 dccp_statistics[1] = alloc_percpu(struct dccp_mib);
658 if (dccp_statistics[1] == NULL)
665 free_percpu(dccp_statistics[0]);
666 dccp_statistics[0] = NULL;
671 static int thash_entries;
672 module_param(thash_entries, int, 0444);
673 MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
675 #ifdef CONFIG_IP_DCCP_DEBUG
677 module_param(dccp_debug, int, 0444);
678 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
681 static int __init dccp_init(void)
684 int ehash_order, bhash_order, i;
685 int rc = proto_register(&dccp_v4_prot, 1);
690 dccp_hashinfo.bind_bucket_cachep =
691 kmem_cache_create("dccp_bind_bucket",
692 sizeof(struct inet_bind_bucket), 0,
693 SLAB_HWCACHE_ALIGN, NULL, NULL);
694 if (!dccp_hashinfo.bind_bucket_cachep)
695 goto out_proto_unregister;
698 * Size and allocate the main established and bind bucket
701 * The methodology is similar to that of the buffer cache.
703 if (num_physpages >= (128 * 1024))
704 goal = num_physpages >> (21 - PAGE_SHIFT);
706 goal = num_physpages >> (23 - PAGE_SHIFT);
709 goal = (thash_entries *
710 sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
711 for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
714 dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
715 sizeof(struct inet_ehash_bucket);
716 dccp_hashinfo.ehash_size >>= 1;
717 while (dccp_hashinfo.ehash_size &
718 (dccp_hashinfo.ehash_size - 1))
719 dccp_hashinfo.ehash_size--;
720 dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
721 __get_free_pages(GFP_ATOMIC, ehash_order);
722 } while (!dccp_hashinfo.ehash && --ehash_order > 0);
724 if (!dccp_hashinfo.ehash) {
725 printk(KERN_CRIT "Failed to allocate DCCP "
726 "established hash table\n");
727 goto out_free_bind_bucket_cachep;
730 for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
731 rwlock_init(&dccp_hashinfo.ehash[i].lock);
732 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
735 bhash_order = ehash_order;
738 dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
739 sizeof(struct inet_bind_hashbucket);
740 if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
743 dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
744 __get_free_pages(GFP_ATOMIC, bhash_order);
745 } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
747 if (!dccp_hashinfo.bhash) {
748 printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
749 goto out_free_dccp_ehash;
752 for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
753 spin_lock_init(&dccp_hashinfo.bhash[i].lock);
754 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
757 if (init_dccp_v4_mibs())
758 goto out_free_dccp_bhash;
761 if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
762 goto out_free_dccp_v4_mibs;
764 inet_register_protosw(&dccp_v4_protosw);
766 rc = dccp_ctl_sock_init();
768 goto out_unregister_protosw;
771 out_unregister_protosw:
772 inet_unregister_protosw(&dccp_v4_protosw);
773 inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
774 out_free_dccp_v4_mibs:
775 free_percpu(dccp_statistics[0]);
776 free_percpu(dccp_statistics[1]);
777 dccp_statistics[0] = dccp_statistics[1] = NULL;
779 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
780 dccp_hashinfo.bhash = NULL;
782 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
783 dccp_hashinfo.ehash = NULL;
784 out_free_bind_bucket_cachep:
785 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
786 dccp_hashinfo.bind_bucket_cachep = NULL;
787 out_proto_unregister:
788 proto_unregister(&dccp_v4_prot);
792 static const char dccp_del_proto_err_msg[] __exitdata =
793 KERN_ERR "can't remove dccp net_protocol\n";
795 static void __exit dccp_fini(void)
797 inet_unregister_protosw(&dccp_v4_protosw);
799 if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
800 printk(dccp_del_proto_err_msg);
802 free_percpu(dccp_statistics[0]);
803 free_percpu(dccp_statistics[1]);
804 free_pages((unsigned long)dccp_hashinfo.bhash,
805 get_order(dccp_hashinfo.bhash_size *
806 sizeof(struct inet_bind_hashbucket)));
807 free_pages((unsigned long)dccp_hashinfo.ehash,
808 get_order(dccp_hashinfo.ehash_size *
809 sizeof(struct inet_ehash_bucket)));
810 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
811 proto_unregister(&dccp_v4_prot);
814 module_init(dccp_init);
815 module_exit(dccp_fini);
818 * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
819 * values directly, Also cover the case where the protocol is not specified,
820 * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
822 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
823 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
824 MODULE_LICENSE("GPL");
825 MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
826 MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");