2 * Copyright (c) 2006 Intel Corporation. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include <linux/completion.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/err.h>
36 #include <linux/interrupt.h>
37 #include <linux/bitops.h>
38 #include <linux/random.h>
40 #include <rdma/ib_cache.h>
/*
 * Forward declarations of the per-device add/remove callbacks defined later
 * in this file.
 */
43 static void mcast_add_one(struct ib_device *device);
44 static void mcast_remove_one(struct ib_device *device);
/*
 * IB client descriptor for this module; it is handed to
 * ib_register_client()/ib_unregister_client() and used as the key for
 * ib_get_client_data()/ib_set_client_data().
 * NOTE(review): the initializer is truncated in this listing (embedded line
 * 48 and the closing brace are absent) -- confirm the remaining members
 * against the full file.
 */
46 static struct ib_client mcast_client = {
47 .name = "ib_multicast",
49 .remove = mcast_remove_one
/* SA client handle passed to every ib_sa_mcmember_rec_query() call below. */
52 static struct ib_sa_client sa_client;
/* Workqueue on which each group's work item (mcast_work_handler) is queued. */
53 static struct workqueue_struct *mcast_wq;
/*
 * Never written in the visible code, so it keeps its static zero
 * initialization; it is memcmp()'d against MGIDs to detect an all-zero MGID.
 */
54 static union ib_gid mgid0;
/*
 * NOTE(review): fragment of a structure whose header and remaining members
 * are absent from this listing. Usage elsewhere in the file (port->dev,
 * port->comp) indicates these are members of struct mcast_port.
 */
/* Owning device; send_join()/send_leave() reach the ib_device through it. */
59 struct mcast_device *dev;
/* Completed by deref_port() on the last reference; mcast_remove_one() waits on it. */
63 struct completion comp;
/*
 * NOTE(review): fragment of a structure whose header and remaining members
 * are absent from this listing. Usage elsewhere in the file (dev->device,
 * dev->event_handler, dev->port[]) indicates these are members of
 * struct mcast_device.
 */
/* Underlying IB device; assigned in mcast_add_one(). */
68 struct ib_device *device;
/* Registered in mcast_add_one() with mcast_event_handler() as the callback. */
69 struct ib_event_handler event_handler;
/*
 * Trailing per-port array. mcast_add_one() allocates the structure with
 * room for phys_port_cnt entries; callers index it with
 * (port_num - start_port).
 */
72 struct mcast_port port[0];
/*
 * NOTE(review): fragment of a structure whose header and remaining members
 * are absent from this listing. Usage elsewhere in the file (group->rec,
 * group->pending_list, ...) indicates these are members of
 * struct mcast_group.
 */
/* Group record; rec.mgid is the rb-tree key, rec.join_state the joined states. */
86 struct ib_sa_mcmember_rec rec;
/* Port whose table holds this group. */
88 struct mcast_port *port;
/* Work item initialized with mcast_work_handler() in acquire_group(). */
90 struct work_struct work;
/* Join requests added by queue_join() and not yet processed. */
91 struct list_head pending_list;
/* Members moved here by join_group() once their join has succeeded. */
92 struct list_head active_list;
/* Member recorded by send_join(); compared in process_join_error(). */
93 struct mcast_member *last_join;
/* MCAST_IDLE, MCAST_BUSY or MCAST_ERROR in the visible code. */
96 enum mcast_state state;
/* Receives the query handle from ib_sa_mcmember_rec_query() in send_leave(). */
97 struct ib_sa_query *query;
/*
 * One user's membership request for a multicast group.
 * NOTE(review): at least one member (embedded line 107) and the closing
 * brace are absent from this listing; the code also uses member->refcount.
 */
101 struct mcast_member {
/* Handle returned to the user; container_of() maps it back to the member. */
102 struct ib_sa_multicast multicast;
/* SA client pinned by ib_sa_client_get() at join, released at free. */
103 struct ib_sa_client *client;
/* Group this request belongs to. */
104 struct mcast_group *group;
/* Links the member onto the group's pending_list or active_list. */
105 struct list_head list;
/* MCAST_JOINING, MCAST_MEMBER or MCAST_ERROR in the visible code. */
106 enum mcast_state state;
/* Completed by deref_member() on the last reference; ib_sa_free_multicast() waits on it. */
108 struct completion comp;
/*
 * Forward declarations of the SA query completion callbacks that
 * send_join() and send_leave() pass to ib_sa_mcmember_rec_query().
 * NOTE(review): the final parameter line of each prototype is absent from
 * this listing.
 */
111 static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
113 static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
/*
 * mcast_find - look up a multicast group in @port's rb-tree by MGID.
 *
 * Each visited node is mapped to its group and the raw MGID bytes are
 * compared with memcmp(); the walk then moves to the left or right child.
 * Both visible callers hold port->lock around the call.
 *
 * NOTE(review): the second parameter, the loop header, the branch
 * conditions and the return statements are absent from this listing. The
 * callers pass a union ib_gid pointer as mgid; presumably the matching
 * group is returned, or NULL when none exists -- confirm against the full
 * file.
 */
116 static struct mcast_group *mcast_find(struct mcast_port *port,
/* Start at the root of this port's group tree. */
119 struct rb_node *node = port->table.rb_node;
120 struct mcast_group *group;
/* Recover the group that embeds the current tree node. */
124 group = rb_entry(node, struct mcast_group, node);
/* Ordering key is the raw MGID. */
125 ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
130 node = node->rb_left;
132 node = node->rb_right;
/*
 * mcast_insert - link @group into @port's rb-tree, ordered by raw MGID.
 *
 * @allow_duplicates: when non-zero, a group whose MGID equals an existing
 *	entry is still inserted; the descent continues to the left of the
 *	equal node.
 *
 * The visible callers hold port->lock around the call.
 *
 * NOTE(review): the loop header, the first two branch conditions, the
 * handling of an equal MGID when duplicates are not allowed, and the return
 * statements are absent from this listing. acquire_group() stores the
 * return value in cur_group, so presumably an existing group is returned on
 * a collision -- confirm against the full file.
 */
137 static struct mcast_group *mcast_insert(struct mcast_port *port,
138 struct mcast_group *group,
139 int allow_duplicates)
/* link tracks the child slot where the new node will be attached. */
141 struct rb_node **link = &port->table.rb_node;
142 struct rb_node *parent = NULL;
143 struct mcast_group *cur_group;
148 cur_group = rb_entry(parent, struct mcast_group, node);
150 ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
151 sizeof group->rec.mgid);
153 link = &(*link)->rb_left;
155 link = &(*link)->rb_right;
156 else if (allow_duplicates)
157 link = &(*link)->rb_left;
/* Attach the node at the slot found, then rebalance the tree. */
161 rb_link_node(&group->node, parent, link);
162 rb_insert_color(&group->node, &port->table);
166 static void deref_port(struct mcast_port *port)
168 if (atomic_dec_and_test(&port->refcount))
169 complete(&port->comp);
/*
 * release_group - drop one reference on @group.
 *
 * The decrement is made with port->lock held, the same lock acquire_group()
 * holds while it looks up a group and takes a reference. When the last
 * reference goes away the group is erased from the port's rb-tree before
 * the lock is released.
 *
 * NOTE(review): the declaration of flags and the statements that follow
 * the first unlock (embedded lines 181-183) are absent from this listing;
 * presumably the group is freed and the port reference dropped there --
 * confirm against the full file.
 */
172 static void release_group(struct mcast_group *group)
174 struct mcast_port *port = group->port;
177 spin_lock_irqsave(&port->lock, flags);
178 if (atomic_dec_and_test(&group->refcount)) {
/* Last reference: unlink from the tree so no new lookup can find it. */
179 rb_erase(&group->node, &port->table);
180 spin_unlock_irqrestore(&port->lock, flags);
184 spin_unlock_irqrestore(&port->lock, flags);
187 static void deref_member(struct mcast_member *member)
189 if (atomic_dec_and_test(&member->refcount))
190 complete(&member->comp);
193 static void queue_join(struct mcast_member *member)
195 struct mcast_group *group = member->group;
198 spin_lock_irqsave(&group->lock, flags);
199 list_add(&member->list, &group->pending_list);
200 if (group->state == MCAST_IDLE) {
201 group->state = MCAST_BUSY;
202 atomic_inc(&group->refcount);
203 queue_work(mcast_wq, &group->work);
205 spin_unlock_irqrestore(&group->lock, flags);
209 * A multicast group has three types of members: full member, non member, and
210 * send only member. We need to keep track of the number of members of each
211 * type based on their join state. Adjust the number of members the belong to
212 * the specified join states.
214 static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
218 for (i = 0; i < 3; i++, join_state >>= 1)
219 if (join_state & 0x1)
220 group->members[i] += inc;
224 * If a multicast group has zero members left for a particular join state, but
225 * the group is still a member with the SA, we need to leave that join state.
226 * Determine which join states we still belong to, but that do not have any
229 static u8 get_leave_state(struct mcast_group *group)
234 for (i = 0; i < 3; i++)
235 if (!group->members[i])
236 leave_state |= (0x1 << i);
238 return leave_state & group->rec.join_state;
/*
 * check_selector - evaluate one selector/value pair of an MCMemberRecord.
 *
 * The comparison only applies when both @selector_mask and @value_mask are
 * set in @comp_mask. Otherwise it picks one of three comparisons of
 * @src_value against @dst_value (<=, >=, !=) and stores the result in err.
 * cmp_rec() uses a non-zero result as the condition of its checks.
 *
 * NOTE(review): the early return, the switch on @selector with its case
 * labels, any default case and the final return are absent from this
 * listing, so which selector value maps to which comparison cannot be
 * confirmed here.
 */
241 static int check_selector(ib_sa_comp_mask comp_mask,
242 ib_sa_comp_mask selector_mask,
243 ib_sa_comp_mask value_mask,
244 u8 selector, u8 src_value, u8 dst_value)
/* Skip the check unless the request specified both selector and value. */
248 if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
253 err = (src_value <= dst_value);
256 err = (src_value >= dst_value);
259 err = (src_value != dst_value);
/*
 * cmp_rec - compare two MCMemberRecords on the fields selected by
 * @comp_mask.
 *
 * mcast_work_handler() calls this with @src = the group's current record
 * and @dst = the record of the join request being processed. The result is
 * used as the status reported to the requester. Plain fields are compared
 * for equality; MTU, rate and packet lifetime go through check_selector()
 * using the selector carried in @dst.
 *
 * NOTE(review): the statement executed when a check fails and the final
 * return are absent from this listing, as is part of the MTU
 * check_selector() call (embedded line 283); the exact error codes cannot
 * be confirmed here.
 */
269 static int cmp_rec(struct ib_sa_mcmember_rec *src,
270 struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
272 /* MGID must already match */
274 if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
275 memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
277 if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
279 if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
281 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
282 IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
285 if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
286 src->traffic_class != dst->traffic_class)
288 if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
290 if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
291 IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
292 src->rate, dst->rate))
294 if (check_selector(comp_mask,
295 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
296 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
297 dst->packet_life_time_selector,
298 src->packet_life_time, dst->packet_life_time))
300 if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
302 if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
303 src->flow_label != dst->flow_label)
305 if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
306 src->hop_limit != dst->hop_limit)
308 if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
311 /* join_state checked separately, proxy_join ignored */
/*
 * send_join - issue an SA MCMemberRecord SET query on behalf of @member.
 *
 * The member is recorded in group->last_join before the query is sent;
 * process_join_error() later compares against it. The query carries the
 * member's own record and component mask, and join_handler() is the
 * completion callback with @group as its context.
 *
 * NOTE(review): the last argument of the query call, the condition guarding
 * the query_id assignment and the return statement are absent from this
 * listing. The literal 3000 is presumably a timeout in milliseconds --
 * confirm against the ib_sa_mcmember_rec_query() prototype.
 */
316 static int send_join(struct mcast_group *group, struct mcast_member *member)
318 struct mcast_port *port = group->port;
321 group->last_join = member;
322 ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
323 port->port_num, IB_MGMT_METHOD_SET,
324 &member->multicast.rec,
325 member->multicast.comp_mask,
326 3000, GFP_KERNEL, join_handler, group,
/* Remember the value returned by the query call. */
329 group->query_id = ret;
/*
 * send_leave - issue an SA MCMemberRecord DELETE query for @leave_state.
 *
 * A local record is sent with only MGID, port GID and join state selected
 * in the component mask; join_state is set to the states being left.
 * leave_handler() is the completion callback with @group as its context,
 * and the query handle is stored in group->query.
 *
 * NOTE(review): the initialization of rec before join_state is overwritten
 * (embedded line 341), the condition guarding the query_id assignment and
 * the return statement are absent from this listing; rec is presumably
 * copied from group->rec -- confirm. The literal 3000 is presumably a
 * timeout in milliseconds.
 */
335 static int send_leave(struct mcast_group *group, u8 leave_state)
337 struct mcast_port *port = group->port;
338 struct ib_sa_mcmember_rec rec;
342 rec.join_state = leave_state;
344 ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
345 port->port_num, IB_SA_METHOD_DELETE, &rec,
346 IB_SA_MCMEMBER_REC_MGID |
347 IB_SA_MCMEMBER_REC_PORT_GID |
348 IB_SA_MCMEMBER_REC_JOIN_STATE,
349 3000, GFP_KERNEL, leave_handler,
350 group, &group->query);
/* Remember the value returned by the query call. */
352 group->query_id = ret;
358 static void join_group(struct mcast_group *group, struct mcast_member *member,
361 member->state = MCAST_MEMBER;
362 adjust_membership(group, join_state, 1);
363 group->rec.join_state |= join_state;
364 member->multicast.rec = group->rec;
365 member->multicast.rec.join_state = join_state;
366 list_move(&member->list, &group->active_list);
369 static int fail_join(struct mcast_group *group, struct mcast_member *member,
372 spin_lock_irq(&group->lock);
373 list_del_init(&member->list);
374 spin_unlock_irq(&group->lock);
375 return member->multicast.callback(status, &member->multicast);
/*
 * process_group_error - fail every active member of @group with -ENETRESET.
 *
 * For each member on active_list: take a member reference, unlink it,
 * decrement the per-join-state counters, mark it MCAST_ERROR, then drop
 * group->lock and invoke the user's callback. The lock is re-taken before
 * the next iteration. Once the list is empty the group's joined states are
 * cleared and the group state is set to MCAST_BUSY.
 *
 * NOTE(review): the declaration of ret, the second argument of the callback
 * call and the condition guarding ib_sa_free_multicast() (embedded line
 * 396) are absent from this listing; the free is presumably conditional on
 * the callback's return value -- confirm against the full file.
 */
378 static void process_group_error(struct mcast_group *group)
380 struct mcast_member *member;
383 spin_lock_irq(&group->lock);
384 while (!list_empty(&group->active_list)) {
385 member = list_entry(group->active_list.next,
386 struct mcast_member, list);
/* Hold a member reference across the callback made without the lock. */
387 atomic_inc(&member->refcount);
388 list_del_init(&member->list);
389 adjust_membership(group, member->multicast.rec.join_state, -1);
390 member->state = MCAST_ERROR;
391 spin_unlock_irq(&group->lock);
393 ret = member->multicast.callback(-ENETRESET,
395 deref_member(member);
397 ib_sa_free_multicast(&member->multicast);
398 spin_lock_irq(&group->lock);
/* The group no longer holds any join state. */
401 group->rec.join_state = 0;
402 group->state = MCAST_BUSY;
403 spin_unlock_irq(&group->lock);
/*
 * mcast_work_handler - per-group state machine; the group's work function.
 *
 * Also called directly from join_handler() and leave_handler(). While the
 * group has pending join requests or is in MCAST_ERROR:
 *   - in the error state, the lock is dropped and process_group_error()
 *     fails all active members;
 *   - otherwise the first pending member is examined. If the group's
 *     joined states already cover the requested ones, the request is
 *     compared against the group's record with cmp_rec() and the user's
 *     callback is invoked with the result. If not, the lock is dropped and
 *     a join is sent to the SA with send_join().
 * After that, any join state with no remaining members is cleared from the
 * group's record and a leave is sent for it. Finally the group is marked
 * MCAST_IDLE and one group reference is released.
 *
 * NOTE(review): many control-flow lines (conditions, else branches,
 * continue/goto/return statements and the declarations of join_state,
 * status and ret) are absent from this listing, so the exact sequencing
 * between the visible statements cannot be confirmed here.
 */
406 static void mcast_work_handler(struct work_struct *work)
408 struct mcast_group *group;
409 struct mcast_member *member;
410 struct ib_sa_multicast *multicast;
/* Map the work item back to the group that embeds it. */
414 group = container_of(work, typeof(*group), work);
416 spin_lock_irq(&group->lock);
417 while (!list_empty(&group->pending_list) ||
418 (group->state == MCAST_ERROR)) {
420 if (group->state == MCAST_ERROR) {
421 spin_unlock_irq(&group->lock);
422 process_group_error(group);
426 member = list_entry(group->pending_list.next,
427 struct mcast_member, list);
428 multicast = &member->multicast;
429 join_state = multicast->rec.join_state;
/* Keep the member alive while the lock is dropped below. */
430 atomic_inc(&member->refcount);
/* True when every requested join state is already held by the group. */
432 if (join_state == (group->rec.join_state & join_state)) {
433 status = cmp_rec(&group->rec, &multicast->rec,
434 multicast->comp_mask);
436 join_group(group, member, join_state);
438 list_del_init(&member->list);
439 spin_unlock_irq(&group->lock);
440 ret = multicast->callback(status, multicast);
442 spin_unlock_irq(&group->lock);
443 status = send_join(group, member);
445 deref_member(member);
448 ret = fail_join(group, member, status);
451 deref_member(member);
453 ib_sa_free_multicast(&member->multicast);
454 spin_lock_irq(&group->lock);
/* Join states still held with the SA but with no members left. */
457 join_state = get_leave_state(group);
459 group->rec.join_state &= ~join_state;
460 spin_unlock_irq(&group->lock);
461 if (send_leave(group, join_state))
464 group->state = MCAST_IDLE;
465 spin_unlock_irq(&group->lock);
466 release_group(group);
471 * Fail a join request if it is still active - at the head of the pending queue.
/*
 * process_join_error - report a failed SA join to the requester.
 *
 * Looks at the first entry of pending_list. If it is the member recorded in
 * group->last_join by send_join(), the member is unlinked and its callback
 * is invoked with @status outside the lock, under a temporary member
 * reference. Otherwise the lock is simply released.
 *
 * NOTE(review): the declaration of ret, the condition guarding
 * ib_sa_free_multicast() (embedded line 487) and the else keyword are
 * absent from this listing. The first list entry is read without a visible
 * list_empty() check; the result is only used in a pointer comparison
 * unless it equals last_join.
 */
473 static void process_join_error(struct mcast_group *group, int status)
475 struct mcast_member *member;
478 spin_lock_irq(&group->lock);
479 member = list_entry(group->pending_list.next,
480 struct mcast_member, list);
481 if (group->last_join == member) {
482 atomic_inc(&member->refcount);
483 list_del_init(&member->list);
484 spin_unlock_irq(&group->lock);
485 ret = member->multicast.callback(status, &member->multicast);
486 deref_member(member);
488 ib_sa_free_multicast(&member->multicast);
490 spin_unlock_irq(&group->lock);
/*
 * join_handler - completion callback for the SA join sent by send_join().
 *
 * The context is the group. The visible code calls process_join_error()
 * with the status and, under port->lock, re-keys the group in the port's
 * rb-tree when its MGID compares equal to the all-zero mgid0 (erase, then
 * re-insert with duplicates allowed). It then runs the group's state
 * machine directly via mcast_work_handler().
 *
 * NOTE(review): the context parameter line, the if/else on status and
 * embedded line 502 (between taking the lock and the MGID test) are absent
 * from this listing; presumably the returned record is copied into the
 * group there -- confirm against the full file.
 */
493 static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
496 struct mcast_group *group = context;
499 process_join_error(group, status);
501 spin_lock_irq(&group->port->lock);
503 if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
504 rb_erase(&group->node, &group->port->table);
505 mcast_insert(group->port, group, 1);
507 spin_unlock_irq(&group->port->lock);
509 mcast_work_handler(&group->work);
512 static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
515 struct mcast_group *group = context;
517 mcast_work_handler(&group->work);
/*
 * acquire_group - find or create the group for @mgid on @port and take a
 * reference on it.
 *
 * The port's tree is searched under port->lock. A new group is allocated
 * with kzalloc() using @gfp_mask, initialized (MGID, empty pending/active
 * lists, work item bound to mcast_work_handler(), lock) and inserted under
 * port->lock. Duplicates are allowed only for the all-zero MGID. A port
 * reference is taken for a newly inserted group, and the group's reference
 * count is incremented before the lock is dropped.
 *
 * NOTE(review): the declarations of flags and is_mgid0, the conditions
 * around the first lookup (embedded lines 531-532), the allocation-failure
 * check, the handling of a non-NULL cur_group and the return statements are
 * absent from this listing. ib_sa_join_multicast() treats a NULL return as
 * failure.
 */
520 static struct mcast_group *acquire_group(struct mcast_port *port,
521 union ib_gid *mgid, gfp_t gfp_mask)
523 struct mcast_group *group, *cur_group;
/* An all-zero MGID is handled specially: duplicates are permitted for it. */
527 is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
529 spin_lock_irqsave(&port->lock, flags);
530 group = mcast_find(port, mgid);
533 spin_unlock_irqrestore(&port->lock, flags);
536 group = kzalloc(sizeof *group, gfp_mask);
541 group->rec.mgid = *mgid;
542 INIT_LIST_HEAD(&group->pending_list);
543 INIT_LIST_HEAD(&group->active_list);
544 INIT_WORK(&group->work, mcast_work_handler);
545 spin_lock_init(&group->lock);
547 spin_lock_irqsave(&port->lock, flags);
548 cur_group = mcast_insert(port, group, is_mgid0);
553 atomic_inc(&port->refcount);
555 atomic_inc(&group->refcount);
556 spin_unlock_irqrestore(&port->lock, flags);
561 * We serialize all join requests to a single group to make our lives much
562 * easier. Otherwise, two users could try to join the same group
563 * simultaneously, with different configurations, one could leave while the
564 * join is in progress, etc., which makes locking around error recovery
/*
 * ib_sa_join_multicast - start joining a multicast group (exported).
 *
 * Allocates a member with kmalloc() using @gfp_mask, pins @client, copies
 * @rec, @comp_mask, @callback and the context into the embedded
 * ib_sa_multicast, and initializes the member's completion, reference count
 * (1) and state (MCAST_JOINING). The group for rec->mgid is then obtained
 * with acquire_group() on the port selected by @port_num.
 *
 * Returns ERR_PTR(-ENODEV) when the device has no client data for this
 * module and ERR_PTR(-ENOMEM) when the member allocation fails.
 *
 * NOTE(review): the final parameter line, the NULL checks, the
 * acquire_group() failure path, the queueing of the join, the success
 * return and the error label(s) are absent from this listing. No range
 * check of @port_num is visible before it indexes dev->port[] -- confirm
 * whether callers guarantee a valid port.
 */
567 struct ib_sa_multicast *
568 ib_sa_join_multicast(struct ib_sa_client *client,
569 struct ib_device *device, u8 port_num,
570 struct ib_sa_mcmember_rec *rec,
571 ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
572 int (*callback)(int status,
573 struct ib_sa_multicast *multicast),
576 struct mcast_device *dev;
577 struct mcast_member *member;
578 struct ib_sa_multicast *multicast;
581 dev = ib_get_client_data(device, &mcast_client);
583 return ERR_PTR(-ENODEV);
585 member = kmalloc(sizeof *member, gfp_mask);
587 return ERR_PTR(-ENOMEM);
/* Pin the SA client for the member's lifetime; released in ib_sa_free_multicast(). */
589 ib_sa_client_get(client);
590 member->client = client;
591 member->multicast.rec = *rec;
592 member->multicast.comp_mask = comp_mask;
593 member->multicast.callback = callback;
594 member->multicast.context = context;
595 init_completion(&member->comp);
596 atomic_set(&member->refcount, 1);
597 member->state = MCAST_JOINING;
599 member->group = acquire_group(&dev->port[port_num - dev->start_port],
600 &rec->mgid, gfp_mask);
601 if (!member->group) {
607 * The user will get the multicast structure in their callback. They
608 * could then free the multicast structure before we can return from
609 * this routine. So we save the pointer to return before queuing
612 multicast = &member->multicast;
/* Error path: undo the ib_sa_client_get() above. */
617 ib_sa_client_put(client);
621 EXPORT_SYMBOL(ib_sa_join_multicast);
/*
 * ib_sa_free_multicast - leave a group and destroy the membership
 * (exported).
 *
 * Under group->lock: if the member had joined (MCAST_MEMBER) its join
 * states are subtracted from the group's counters, and the member is
 * unlinked from whichever list it is on. If the group is idle it is marked
 * busy and its work is queued, which keeps the member's group reference
 * alive until the work runs. Otherwise the group reference is released
 * immediately. The function then drops the member's reference and blocks in
 * wait_for_completion() until all other member references are gone, then
 * releases the SA client reference.
 *
 * Because it waits for a completion and uses spin_lock_irq(), this must be
 * called from a context that may sleep, with interrupts enabled.
 *
 * NOTE(review): the else keyword and the lines after ib_sa_client_put()
 * (embedded line 650, presumably freeing the member) are absent from this
 * listing.
 */
623 void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
625 struct mcast_member *member;
626 struct mcast_group *group;
628 member = container_of(multicast, struct mcast_member, multicast);
629 group = member->group;
631 spin_lock_irq(&group->lock);
632 if (member->state == MCAST_MEMBER)
633 adjust_membership(group, multicast->rec.join_state, -1);
635 list_del_init(&member->list);
637 if (group->state == MCAST_IDLE) {
638 group->state = MCAST_BUSY;
639 spin_unlock_irq(&group->lock);
640 /* Continue to hold reference on group until callback */
641 queue_work(mcast_wq, &group->work);
643 spin_unlock_irq(&group->lock);
644 release_group(group);
/* Drop our reference, then wait for every other holder to drop theirs. */
647 deref_member(member);
648 wait_for_completion(&member->comp);
649 ib_sa_client_put(member->client);
652 EXPORT_SYMBOL(ib_sa_free_multicast);
/*
 * ib_sa_get_mcmember_rec - look up the record of a known multicast group
 * (exported).
 *
 * Finds the group for @mgid on the port selected by @port_num, under
 * port->lock. -EADDRNOTAVAIL is the error value assigned on the
 * lookup-failure path.
 *
 * NOTE(review): the declarations of flags and ret, the check of dev, the
 * found/not-found branches (presumably copying group->rec into @rec on
 * success) and the return statement are absent from this listing. No range
 * check of @port_num is visible before it indexes dev->port[].
 */
654 int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
655 union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
657 struct mcast_device *dev;
658 struct mcast_port *port;
659 struct mcast_group *group;
663 dev = ib_get_client_data(device, &mcast_client);
667 port = &dev->port[port_num - dev->start_port];
668 spin_lock_irqsave(&port->lock, flags);
669 group = mcast_find(port, mgid);
673 ret = -EADDRNOTAVAIL;
674 spin_unlock_irqrestore(&port->lock, flags);
678 EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
/*
 * ib_init_ah_from_mcmember - fill address-handle attributes from a
 * multicast member record (exported).
 *
 * Looks up rec->port_gid in the device's cached GID table to obtain the
 * source GID index. It then zeroes @ah_attr and fills it from @rec: the
 * MLID becomes the destination LID and the flow label is converted from
 * big-endian, and a GRH addressed to the MGID is always requested.
 *
 * NOTE(review): the local declarations (ret, p, gid_index), the check of
 * the lookup result and the return statement are absent from this listing.
 * gid_index is narrowed to u8 by an explicit cast; its declared type is not
 * visible here.
 */
680 int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
681 struct ib_sa_mcmember_rec *rec,
682 struct ib_ah_attr *ah_attr)
688 ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
692 memset(ah_attr, 0, sizeof *ah_attr);
693 ah_attr->dlid = be16_to_cpu(rec->mlid);
694 ah_attr->sl = rec->sl;
695 ah_attr->port_num = port_num;
696 ah_attr->static_rate = rec->rate;
/* Multicast traffic carries a global route header. */
698 ah_attr->ah_flags = IB_AH_GRH;
699 ah_attr->grh.dgid = rec->mgid;
701 ah_attr->grh.sgid_index = (u8) gid_index;
702 ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
703 ah_attr->grh.hop_limit = rec->hop_limit;
704 ah_attr->grh.traffic_class = rec->traffic_class;
708 EXPORT_SYMBOL(ib_init_ah_from_mcmember);
/*
 * mcast_groups_lost - put every group on @port into the error state.
 *
 * Walks the port's rb-tree under port->lock. For each group, with
 * group->lock held: an idle group gets a reference taken and its work
 * queued, and the group's state is set to MCAST_ERROR so the work handler
 * will fail its active members. group->lock is taken with plain spin_lock()
 * because interrupts are already disabled by the outer irqsave lock.
 *
 * NOTE(review): the declaration of flags and the closing braces are absent
 * from this listing.
 */
710 static void mcast_groups_lost(struct mcast_port *port)
712 struct mcast_group *group;
713 struct rb_node *node;
716 spin_lock_irqsave(&port->lock, flags);
717 for (node = rb_first(&port->table); node; node = rb_next(node)) {
718 group = rb_entry(node, struct mcast_group, node);
719 spin_lock(&group->lock);
720 if (group->state == MCAST_IDLE) {
/* Reference held for the queued work item. */
721 atomic_inc(&group->refcount);
722 queue_work(mcast_wq, &group->work);
724 group->state = MCAST_ERROR;
725 spin_unlock(&group->lock);
727 spin_unlock_irqrestore(&port->lock, flags);
/*
 * mcast_event_handler - asynchronous IB event callback for a device.
 *
 * For port error, LID change, SM change and client re-register events, all
 * groups on the affected port are passed to mcast_groups_lost().
 *
 * NOTE(review): the remainder of the port index expression (embedded line
 * 743), any break/default statements and the closing braces are absent from
 * this listing.
 */
730 static void mcast_event_handler(struct ib_event_handler *handler,
731 struct ib_event *event)
733 struct mcast_device *dev;
/* The handler is embedded in the per-device structure. */
735 dev = container_of(handler, struct mcast_device, event_handler);
737 switch (event->event) {
738 case IB_EVENT_PORT_ERR:
739 case IB_EVENT_LID_CHANGE:
740 case IB_EVENT_SM_CHANGE:
741 case IB_EVENT_CLIENT_REREGISTER:
742 mcast_groups_lost(&dev->port[event->element.port_num -
/*
 * mcast_add_one - set up multicast state for a newly added IB device.
 *
 * Only devices whose transport is RDMA_TRANSPORT_IB are handled. The
 * per-device structure is allocated with room for phys_port_cnt port
 * entries. A switch uses the single port number 0; for other node types
 * end_port is phys_port_cnt. Each port gets its number, lock, empty
 * rb-tree, completion and an initial reference count of 1. The device is
 * then published as client data and the event handler is registered.
 *
 * NOTE(review): the declaration of i, the early return, the allocation
 * flags and failure check, the start_port assignment for the non-switch
 * case (embedded lines 766-767) and embedded line 773 inside the loop are
 * absent from this listing.
 */
750 static void mcast_add_one(struct ib_device *device)
752 struct mcast_device *dev;
753 struct mcast_port *port;
756 if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
759 dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
764 if (device->node_type == RDMA_NODE_IB_SWITCH)
765 dev->start_port = dev->end_port = 0;
768 dev->end_port = device->phys_port_cnt;
/* Inclusive range: one array slot per port number start_port..end_port. */
771 for (i = 0; i <= dev->end_port - dev->start_port; i++) {
772 port = &dev->port[i];
774 port->port_num = dev->start_port + i;
775 spin_lock_init(&port->lock);
776 port->table = RB_ROOT;
777 init_completion(&port->comp);
778 atomic_set(&port->refcount, 1);
781 dev->device = device;
782 ib_set_client_data(device, &mcast_client, dev);
784 INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
785 ib_register_event_handler(&dev->event_handler);
/*
 * mcast_remove_one - tear down multicast state when an IB device goes away.
 *
 * Unregisters the event handler so no new error processing is triggered,
 * flushes the multicast workqueue, and then waits on every port's
 * completion, which deref_port() signals when the port's reference count
 * reaches zero.
 *
 * NOTE(review): the declaration of i, the check of dev, embedded line 803
 * inside the loop (presumably dropping the port's initial reference before
 * waiting) and the final cleanup are absent from this listing -- confirm
 * against the full file.
 */
788 static void mcast_remove_one(struct ib_device *device)
790 struct mcast_device *dev;
791 struct mcast_port *port;
794 dev = ib_get_client_data(device, &mcast_client);
798 ib_unregister_event_handler(&dev->event_handler);
799 flush_workqueue(mcast_wq);
801 for (i = 0; i <= dev->end_port - dev->start_port; i++) {
802 port = &dev->port[i];
804 wait_for_completion(&port->comp);
/*
 * NOTE(review): the header of the enclosing function is absent from this
 * listing; the statements form the setup counterpart of mcast_cleanup()
 * and the function is presumably mcast_init() -- confirm against the full
 * file.
 *
 * Visible sequence: create the single-threaded "ib_mcast" workqueue,
 * register the SA client, then register the IB client. The last two lines
 * undo the first two steps in reverse order and presumably form the error
 * path taken when ib_register_client() fails. The failure checks and return
 * statements are not visible.
 */
814 mcast_wq = create_singlethread_workqueue("ib_mcast");
818 ib_sa_register_client(&sa_client);
820 ret = ib_register_client(&mcast_client);
826 ib_sa_unregister_client(&sa_client);
827 destroy_workqueue(mcast_wq);
831 void mcast_cleanup(void)
833 ib_unregister_client(&mcast_client);
834 ib_sa_unregister_client(&sa_client);
835 destroy_workqueue(mcast_wq);