Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[linux-2.6] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/capability.h>
37 #include <linux/errno.h>
38 #include <linux/timer.h>
39 #include <linux/mm.h>
40 #include <linux/kernel.h>
41 #include <linux/fcntl.h>
42 #include <linux/stat.h>
43 #include <linux/socket.h>
44 #include <linux/in.h>
45 #include <linux/inet.h>
46 #include <linux/netdevice.h>
47 #include <linux/inetdevice.h>
48 #include <linux/igmp.h>
49 #include <linux/proc_fs.h>
50 #include <linux/seq_file.h>
51 #include <linux/mroute.h>
52 #include <linux/init.h>
53 #include <linux/if_ether.h>
54 #include <net/ip.h>
55 #include <net/protocol.h>
56 #include <linux/skbuff.h>
57 #include <net/route.h>
58 #include <net/sock.h>
59 #include <net/icmp.h>
60 #include <net/udp.h>
61 #include <net/raw.h>
62 #include <linux/notifier.h>
63 #include <linux/if_arp.h>
64 #include <linux/netfilter_ipv4.h>
65 #include <net/ipip.h>
66 #include <net/checksum.h>
67
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM 1	/* Defined when either PIM-SM version is configured */
70 #endif
71
72 static struct sock *mroute_socket;	/* Socket that upcalls are queued to; NULL when none is registered */
73
74
75 /* Big lock, protecting vif table, mrt cache and mroute socket state.
76    Note that the changes are semaphored via rtnl_lock.
77  */
78
79 static DEFINE_RWLOCK(mrt_lock);
80
81 /*
82  *      Multicast router control variables
83  */
84
85 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
86 static int maxvif;					/* One past the highest vif index in use */
87
88 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)	/* True when vif slot idx has a device attached */
89
90 static int mroute_do_assert;                            /* Set in PIM assert    */
91 static int mroute_do_pim;				/* Set through the MRT_PIM socket option */
92
93 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
94
95 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
96 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
97
98 /* Special spinlock for queue of unresolved entries */
99 static DEFINE_SPINLOCK(mfc_unres_lock);
100
101 /* We return to Alan's original scheme. The hash table of resolved
102    entries is changed only in process context and protected
103    with the weak lock mrt_lock. The queue of unresolved entries is
104    protected with the strong spinlock mfc_unres_lock.
105
106    In this case the data path is free of exclusive locks altogether.
107  */
108
109 static kmem_cache_t *mrt_cachep __read_mostly;	/* Slab cache that struct mfc_cache entries are allocated from */
110
111 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
112 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
113 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
114
115 #ifdef CONFIG_IP_PIMSM_V2
116 static struct net_protocol pim_protocol;
117 #endif
118
119 static struct timer_list ipmr_expire_timer;	/* Timer that ages out entries on mfc_unres_queue */
120
121 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
122
123 static
124 struct net_device *ipmr_new_tunnel(struct vifctl *v)
125 {
126         struct net_device  *dev;
127
128         dev = __dev_get_by_name("tunl0");
129
130         if (dev) {
131                 int err;
132                 struct ifreq ifr;
133                 mm_segment_t    oldfs;
134                 struct ip_tunnel_parm p;
135                 struct in_device  *in_dev;
136
137                 memset(&p, 0, sizeof(p));
138                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
139                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
140                 p.iph.version = 4;
141                 p.iph.ihl = 5;
142                 p.iph.protocol = IPPROTO_IPIP;
143                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
144                 ifr.ifr_ifru.ifru_data = (void*)&p;
145
146                 oldfs = get_fs(); set_fs(KERNEL_DS);
147                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
148                 set_fs(oldfs);
149
150                 dev = NULL;
151
152                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
153                         dev->flags |= IFF_MULTICAST;
154
155                         in_dev = __in_dev_get_rtnl(dev);
156                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
157                                 goto failure;
158                         in_dev->cnf.rp_filter = 0;
159
160                         if (dev_open(dev))
161                                 goto failure;
162                 }
163         }
164         return dev;
165
166 failure:
167         /* allow the register to be completed before unregistering. */
168         rtnl_unlock();
169         rtnl_lock();
170
171         unregister_netdevice(dev);
172         return NULL;
173 }
174
175 #ifdef CONFIG_IP_PIMSM
176
177 static int reg_vif_num = -1;	/* vif_table index of the PIM register vif; -1 when none is installed */
178
179 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
180 {
181         read_lock(&mrt_lock);
182         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
183         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
184         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
185         read_unlock(&mrt_lock);
186         kfree_skb(skb);
187         return 0;
188 }
189
/* Statistics handler: the counters live in the device's private area. */
static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = netdev_priv(dev);

	return stats;
}
194
195 static void reg_vif_setup(struct net_device *dev)
196 {
197         dev->type               = ARPHRD_PIMREG;
198         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
199         dev->flags              = IFF_NOARP;
200         dev->hard_start_xmit    = reg_vif_xmit;
201         dev->get_stats          = reg_vif_get_stats;
202         dev->destructor         = free_netdev;
203 }
204
205 static struct net_device *ipmr_reg_vif(void)
206 {
207         struct net_device *dev;
208         struct in_device *in_dev;
209
210         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
211                            reg_vif_setup);
212
213         if (dev == NULL)
214                 return NULL;
215
216         if (register_netdevice(dev)) {
217                 free_netdev(dev);
218                 return NULL;
219         }
220         dev->iflink = 0;
221
222         if ((in_dev = inetdev_init(dev)) == NULL)
223                 goto failure;
224
225         in_dev->cnf.rp_filter = 0;
226
227         if (dev_open(dev))
228                 goto failure;
229
230         return dev;
231
232 failure:
233         /* allow the register to be completed before unregistering. */
234         rtnl_unlock();
235         rtnl_lock();
236
237         unregister_netdevice(dev);
238         return NULL;
239 }
240 #endif
241
242 /*
243  *      Delete a VIF entry
244  */
245  
246 static int vif_delete(int vifi)
247 {
248         struct vif_device *v;
249         struct net_device *dev;
250         struct in_device *in_dev;
251
252         if (vifi < 0 || vifi >= maxvif)
253                 return -EADDRNOTAVAIL;
254
255         v = &vif_table[vifi];
256
257         write_lock_bh(&mrt_lock);
258         dev = v->dev;
259         v->dev = NULL;
260
261         if (!dev) {
262                 write_unlock_bh(&mrt_lock);
263                 return -EADDRNOTAVAIL;
264         }
265
266 #ifdef CONFIG_IP_PIMSM
267         if (vifi == reg_vif_num)
268                 reg_vif_num = -1;
269 #endif
270
271         if (vifi+1 == maxvif) {
272                 int tmp;
273                 for (tmp=vifi-1; tmp>=0; tmp--) {
274                         if (VIF_EXISTS(tmp))
275                                 break;
276                 }
277                 maxvif = tmp+1;
278         }
279
280         write_unlock_bh(&mrt_lock);
281
282         dev_set_allmulti(dev, -1);
283
284         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
285                 in_dev->cnf.mc_forwarding--;
286                 ip_rt_multicast_event(in_dev);
287         }
288
289         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
290                 unregister_netdevice(dev);
291
292         dev_put(dev);
293         return 0;
294 }
295
296 /* Destroy an unresolved cache entry, killing queued skbs
297    and reporting error to netlink readers.
298  */
299
300 static void ipmr_destroy_unres(struct mfc_cache *c)
301 {
302         struct sk_buff *skb;
303         struct nlmsgerr *e;
304
305         atomic_dec(&cache_resolve_queue_len);
306
307         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
308                 if (skb->nh.iph->version == 0) {
309                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
310                         nlh->nlmsg_type = NLMSG_ERROR;
311                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
312                         skb_trim(skb, nlh->nlmsg_len);
313                         e = NLMSG_DATA(nlh);
314                         e->error = -ETIMEDOUT;
315                         memset(&e->msg, 0, sizeof(e->msg));
316                         netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
317                 } else
318                         kfree_skb(skb);
319         }
320
321         kmem_cache_free(mrt_cachep, c);
322 }
323
324
325 /* Single timer process for all the unresolved queue. */
326
327 static void ipmr_expire_process(unsigned long dummy)
328 {
329         unsigned long now;
330         unsigned long expires;
331         struct mfc_cache *c, **cp;
332
333         if (!spin_trylock(&mfc_unres_lock)) {
334                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
335                 return;
336         }
337
338         if (atomic_read(&cache_resolve_queue_len) == 0)
339                 goto out;
340
341         now = jiffies;
342         expires = 10*HZ;
343         cp = &mfc_unres_queue;
344
345         while ((c=*cp) != NULL) {
346                 if (time_after(c->mfc_un.unres.expires, now)) {
347                         unsigned long interval = c->mfc_un.unres.expires - now;
348                         if (interval < expires)
349                                 expires = interval;
350                         cp = &c->next;
351                         continue;
352                 }
353
354                 *cp = c->next;
355
356                 ipmr_destroy_unres(c);
357         }
358
359         if (atomic_read(&cache_resolve_queue_len))
360                 mod_timer(&ipmr_expire_timer, jiffies + expires);
361
362 out:
363         spin_unlock(&mfc_unres_lock);
364 }
365
366 /* Fill oifs list. It is called under write locked mrt_lock. */
367
368 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
369 {
370         int vifi;
371
372         cache->mfc_un.res.minvif = MAXVIFS;
373         cache->mfc_un.res.maxvif = 0;
374         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
375
376         for (vifi=0; vifi<maxvif; vifi++) {
377                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
378                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
379                         if (cache->mfc_un.res.minvif > vifi)
380                                 cache->mfc_un.res.minvif = vifi;
381                         if (cache->mfc_un.res.maxvif <= vifi)
382                                 cache->mfc_un.res.maxvif = vifi + 1;
383                 }
384         }
385 }
386
387 static int vif_add(struct vifctl *vifc, int mrtsock)
388 {
389         int vifi = vifc->vifc_vifi;
390         struct vif_device *v = &vif_table[vifi];
391         struct net_device *dev;
392         struct in_device *in_dev;
393
394         /* Is vif busy ? */
395         if (VIF_EXISTS(vifi))
396                 return -EADDRINUSE;
397
398         switch (vifc->vifc_flags) {
399 #ifdef CONFIG_IP_PIMSM
400         case VIFF_REGISTER:
401                 /*
402                  * Special Purpose VIF in PIM
403                  * All the packets will be sent to the daemon
404                  */
405                 if (reg_vif_num >= 0)
406                         return -EADDRINUSE;
407                 dev = ipmr_reg_vif();
408                 if (!dev)
409                         return -ENOBUFS;
410                 break;
411 #endif
412         case VIFF_TUNNEL:       
413                 dev = ipmr_new_tunnel(vifc);
414                 if (!dev)
415                         return -ENOBUFS;
416                 break;
417         case 0:
418                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
419                 if (!dev)
420                         return -EADDRNOTAVAIL;
421                 dev_put(dev);
422                 break;
423         default:
424                 return -EINVAL;
425         }
426
427         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
428                 return -EADDRNOTAVAIL;
429         in_dev->cnf.mc_forwarding++;
430         dev_set_allmulti(dev, +1);
431         ip_rt_multicast_event(in_dev);
432
433         /*
434          *      Fill in the VIF structures
435          */
436         v->rate_limit=vifc->vifc_rate_limit;
437         v->local=vifc->vifc_lcl_addr.s_addr;
438         v->remote=vifc->vifc_rmt_addr.s_addr;
439         v->flags=vifc->vifc_flags;
440         if (!mrtsock)
441                 v->flags |= VIFF_STATIC;
442         v->threshold=vifc->vifc_threshold;
443         v->bytes_in = 0;
444         v->bytes_out = 0;
445         v->pkt_in = 0;
446         v->pkt_out = 0;
447         v->link = dev->ifindex;
448         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
449                 v->link = dev->iflink;
450
451         /* And finish update writing critical data */
452         write_lock_bh(&mrt_lock);
453         dev_hold(dev);
454         v->dev=dev;
455 #ifdef CONFIG_IP_PIMSM
456         if (v->flags&VIFF_REGISTER)
457                 reg_vif_num = vifi;
458 #endif
459         if (vifi+1 > maxvif)
460                 maxvif = vifi+1;
461         write_unlock_bh(&mrt_lock);
462         return 0;
463 }
464
465 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
466 {
467         int line=MFC_HASH(mcastgrp,origin);
468         struct mfc_cache *c;
469
470         for (c=mfc_cache_array[line]; c; c = c->next) {
471                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
472                         break;
473         }
474         return c;
475 }
476
477 /*
478  *      Allocate a multicast cache entry
479  */
480 static struct mfc_cache *ipmr_cache_alloc(void)
481 {
482         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
483         if(c==NULL)
484                 return NULL;
485         memset(c, 0, sizeof(*c));
486         c->mfc_un.res.minvif = MAXVIFS;
487         return c;
488 }
489
490 static struct mfc_cache *ipmr_cache_alloc_unres(void)
491 {
492         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
493         if(c==NULL)
494                 return NULL;
495         memset(c, 0, sizeof(*c));
496         skb_queue_head_init(&c->mfc_un.unres.unresolved);
497         c->mfc_un.unres.expires = jiffies + 10*HZ;
498         return c;
499 }
500
501 /*
502  *      A cache entry has gone into a resolved state from queued
503  */
504  
505 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
506 {
507         struct sk_buff *skb;
508         struct nlmsgerr *e;
509
510         /*
511          *      Play the pending entries through our router
512          */
513
514         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
515                 if (skb->nh.iph->version == 0) {
516                         int err;
517                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
518
519                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
520                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
521                         } else {
522                                 nlh->nlmsg_type = NLMSG_ERROR;
523                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
524                                 skb_trim(skb, nlh->nlmsg_len);
525                                 e = NLMSG_DATA(nlh);
526                                 e->error = -EMSGSIZE;
527                                 memset(&e->msg, 0, sizeof(e->msg));
528                         }
529                         err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
530                 } else
531                         ip_mr_forward(skb, c, 0);
532         }
533 }
534
535 /*
536  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
537  *      expects the following bizarre scheme.
538  *
539  *      Called under mrt_lock.
540  */
541  
542 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
543 {
544         struct sk_buff *skb;
545         int ihl = pkt->nh.iph->ihl<<2;
546         struct igmphdr *igmp;
547         struct igmpmsg *msg;
548         int ret;
549
550 #ifdef CONFIG_IP_PIMSM
551         if (assert == IGMPMSG_WHOLEPKT)
552                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
553         else
554 #endif
555                 skb = alloc_skb(128, GFP_ATOMIC);
556
557         if(!skb)
558                 return -ENOBUFS;
559
560 #ifdef CONFIG_IP_PIMSM
561         if (assert == IGMPMSG_WHOLEPKT) {
562                 /* Ugly, but we have no choice with this interface.
563                    Duplicate old header, fix ihl, length etc.
564                    And all this only to mangle msg->im_msgtype and
565                    to set msg->im_mbz to "mbz" :-)
566                  */
567                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
568                 skb->nh.raw = skb->h.raw = (u8*)msg;
569                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
570                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
571                 msg->im_mbz = 0;
572                 msg->im_vif = reg_vif_num;
573                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
574                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
575         } else 
576 #endif
577         {       
578                 
579         /*
580          *      Copy the IP header
581          */
582
583         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
584         memcpy(skb->data,pkt->data,ihl);
585         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
586         msg = (struct igmpmsg*)skb->nh.iph;
587         msg->im_vif = vifi;
588         skb->dst = dst_clone(pkt->dst);
589
590         /*
591          *      Add our header
592          */
593
594         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
595         igmp->type      =
596         msg->im_msgtype = assert;
597         igmp->code      =       0;
598         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
599         skb->h.raw = skb->nh.raw;
600         }
601
602         if (mroute_socket == NULL) {
603                 kfree_skb(skb);
604                 return -EINVAL;
605         }
606
607         /*
608          *      Deliver to mrouted
609          */
610         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
611                 if (net_ratelimit())
612                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
613                 kfree_skb(skb);
614         }
615
616         return ret;
617 }
618
619 /*
620  *      Queue a packet for resolution. It gets locked cache entry!
621  */
622  
623 static int
624 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
625 {
626         int err;
627         struct mfc_cache *c;
628
629         spin_lock_bh(&mfc_unres_lock);
630         for (c=mfc_unres_queue; c; c=c->next) {
631                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
632                     c->mfc_origin == skb->nh.iph->saddr)
633                         break;
634         }
635
636         if (c == NULL) {
637                 /*
638                  *      Create a new entry if allowable
639                  */
640
641                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
642                     (c=ipmr_cache_alloc_unres())==NULL) {
643                         spin_unlock_bh(&mfc_unres_lock);
644
645                         kfree_skb(skb);
646                         return -ENOBUFS;
647                 }
648
649                 /*
650                  *      Fill in the new cache entry
651                  */
652                 c->mfc_parent=-1;
653                 c->mfc_origin=skb->nh.iph->saddr;
654                 c->mfc_mcastgrp=skb->nh.iph->daddr;
655
656                 /*
657                  *      Reflect first query at mrouted.
658                  */
659                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
660                         /* If the report failed throw the cache entry 
661                            out - Brad Parker
662                          */
663                         spin_unlock_bh(&mfc_unres_lock);
664
665                         kmem_cache_free(mrt_cachep, c);
666                         kfree_skb(skb);
667                         return err;
668                 }
669
670                 atomic_inc(&cache_resolve_queue_len);
671                 c->next = mfc_unres_queue;
672                 mfc_unres_queue = c;
673
674                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
675         }
676
677         /*
678          *      See if we can append the packet
679          */
680         if (c->mfc_un.unres.unresolved.qlen>3) {
681                 kfree_skb(skb);
682                 err = -ENOBUFS;
683         } else {
684                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
685                 err = 0;
686         }
687
688         spin_unlock_bh(&mfc_unres_lock);
689         return err;
690 }
691
692 /*
693  *      MFC cache manipulation by user space mroute daemon
694  */
695
696 static int ipmr_mfc_delete(struct mfcctl *mfc)
697 {
698         int line;
699         struct mfc_cache *c, **cp;
700
701         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
702
703         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
704                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
705                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
706                         write_lock_bh(&mrt_lock);
707                         *cp = c->next;
708                         write_unlock_bh(&mrt_lock);
709
710                         kmem_cache_free(mrt_cachep, c);
711                         return 0;
712                 }
713         }
714         return -ENOENT;
715 }
716
717 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
718 {
719         int line;
720         struct mfc_cache *uc, *c, **cp;
721
722         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
723
724         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
725                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
726                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
727                         break;
728         }
729
730         if (c != NULL) {
731                 write_lock_bh(&mrt_lock);
732                 c->mfc_parent = mfc->mfcc_parent;
733                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
734                 if (!mrtsock)
735                         c->mfc_flags |= MFC_STATIC;
736                 write_unlock_bh(&mrt_lock);
737                 return 0;
738         }
739
740         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
741                 return -EINVAL;
742
743         c=ipmr_cache_alloc();
744         if (c==NULL)
745                 return -ENOMEM;
746
747         c->mfc_origin=mfc->mfcc_origin.s_addr;
748         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
749         c->mfc_parent=mfc->mfcc_parent;
750         ipmr_update_thresholds(c, mfc->mfcc_ttls);
751         if (!mrtsock)
752                 c->mfc_flags |= MFC_STATIC;
753
754         write_lock_bh(&mrt_lock);
755         c->next = mfc_cache_array[line];
756         mfc_cache_array[line] = c;
757         write_unlock_bh(&mrt_lock);
758
759         /*
760          *      Check to see if we resolved a queued list. If so we
761          *      need to send on the frames and tidy up.
762          */
763         spin_lock_bh(&mfc_unres_lock);
764         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
765              cp = &uc->next) {
766                 if (uc->mfc_origin == c->mfc_origin &&
767                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
768                         *cp = uc->next;
769                         if (atomic_dec_and_test(&cache_resolve_queue_len))
770                                 del_timer(&ipmr_expire_timer);
771                         break;
772                 }
773         }
774         spin_unlock_bh(&mfc_unres_lock);
775
776         if (uc) {
777                 ipmr_cache_resolve(uc, c);
778                 kmem_cache_free(mrt_cachep, uc);
779         }
780         return 0;
781 }
782
783 /*
784  *      Close the multicast socket, and clear the vif tables etc
785  */
786  
787 static void mroute_clean_tables(struct sock *sk)
788 {
789         int i;
790                 
791         /*
792          *      Shut down all active vif entries
793          */
794         for(i=0; i<maxvif; i++) {
795                 if (!(vif_table[i].flags&VIFF_STATIC))
796                         vif_delete(i);
797         }
798
799         /*
800          *      Wipe the cache
801          */
802         for (i=0;i<MFC_LINES;i++) {
803                 struct mfc_cache *c, **cp;
804
805                 cp = &mfc_cache_array[i];
806                 while ((c = *cp) != NULL) {
807                         if (c->mfc_flags&MFC_STATIC) {
808                                 cp = &c->next;
809                                 continue;
810                         }
811                         write_lock_bh(&mrt_lock);
812                         *cp = c->next;
813                         write_unlock_bh(&mrt_lock);
814
815                         kmem_cache_free(mrt_cachep, c);
816                 }
817         }
818
819         if (atomic_read(&cache_resolve_queue_len) != 0) {
820                 struct mfc_cache *c;
821
822                 spin_lock_bh(&mfc_unres_lock);
823                 while (mfc_unres_queue != NULL) {
824                         c = mfc_unres_queue;
825                         mfc_unres_queue = c->next;
826                         spin_unlock_bh(&mfc_unres_lock);
827
828                         ipmr_destroy_unres(c);
829
830                         spin_lock_bh(&mfc_unres_lock);
831                 }
832                 spin_unlock_bh(&mfc_unres_lock);
833         }
834 }
835
836 static void mrtsock_destruct(struct sock *sk)
837 {
838         rtnl_lock();
839         if (sk == mroute_socket) {
840                 ipv4_devconf.mc_forwarding--;
841
842                 write_lock_bh(&mrt_lock);
843                 mroute_socket=NULL;
844                 write_unlock_bh(&mrt_lock);
845
846                 mroute_clean_tables(sk);
847         }
848         rtnl_unlock();
849 }
850
851 /*
852  *      Socket options and virtual interface manipulation. The whole
853  *      virtual interface system is a complete heap, but unfortunately
854  *      that's how BSD mrouted happens to think. Maybe one day with a proper
855  *      MOSPF/PIM router set up we can clean this up.
856  */
857  
858 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
859 {
860         int ret;
861         struct vifctl vif;
862         struct mfcctl mfc;
863         
864         if(optname!=MRT_INIT)
865         {
866                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
867                         return -EACCES;
868         }
869
870         switch(optname)
871         {
872                 case MRT_INIT:
873                         if (sk->sk_type != SOCK_RAW ||
874                             inet_sk(sk)->num != IPPROTO_IGMP)
875                                 return -EOPNOTSUPP;
876                         if(optlen!=sizeof(int))
877                                 return -ENOPROTOOPT;
878
879                         rtnl_lock();
880                         if (mroute_socket) {
881                                 rtnl_unlock();
882                                 return -EADDRINUSE;
883                         }
884
885                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
886                         if (ret == 0) {
887                                 write_lock_bh(&mrt_lock);
888                                 mroute_socket=sk;
889                                 write_unlock_bh(&mrt_lock);
890
891                                 ipv4_devconf.mc_forwarding++;
892                         }
893                         rtnl_unlock();
894                         return ret;
895                 case MRT_DONE:
896                         if (sk!=mroute_socket)
897                                 return -EACCES;
898                         return ip_ra_control(sk, 0, NULL);
899                 case MRT_ADD_VIF:
900                 case MRT_DEL_VIF:
901                         if(optlen!=sizeof(vif))
902                                 return -EINVAL;
903                         if (copy_from_user(&vif,optval,sizeof(vif)))
904                                 return -EFAULT; 
905                         if(vif.vifc_vifi >= MAXVIFS)
906                                 return -ENFILE;
907                         rtnl_lock();
908                         if (optname==MRT_ADD_VIF) {
909                                 ret = vif_add(&vif, sk==mroute_socket);
910                         } else {
911                                 ret = vif_delete(vif.vifc_vifi);
912                         }
913                         rtnl_unlock();
914                         return ret;
915
916                 /*
917                  *      Manipulate the forwarding caches. These live
918                  *      in a sort of kernel/user symbiosis.
919                  */
920                 case MRT_ADD_MFC:
921                 case MRT_DEL_MFC:
922                         if(optlen!=sizeof(mfc))
923                                 return -EINVAL;
924                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
925                                 return -EFAULT;
926                         rtnl_lock();
927                         if (optname==MRT_DEL_MFC)
928                                 ret = ipmr_mfc_delete(&mfc);
929                         else
930                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
931                         rtnl_unlock();
932                         return ret;
933                 /*
934                  *      Control PIM assert.
935                  */
936                 case MRT_ASSERT:
937                 {
938                         int v;
939                         if(get_user(v,(int __user *)optval))
940                                 return -EFAULT;
941                         mroute_do_assert=(v)?1:0;
942                         return 0;
943                 }
944 #ifdef CONFIG_IP_PIMSM
945                 case MRT_PIM:
946                 {
947                         int v, ret;
948                         if(get_user(v,(int __user *)optval))
949                                 return -EFAULT;
950                         v = (v)?1:0;
951                         rtnl_lock();
952                         ret = 0;
953                         if (v != mroute_do_pim) {
954                                 mroute_do_pim = v;
955                                 mroute_do_assert = v;
956 #ifdef CONFIG_IP_PIMSM_V2
957                                 if (mroute_do_pim)
958                                         ret = inet_add_protocol(&pim_protocol,
959                                                                 IPPROTO_PIM);
960                                 else
961                                         ret = inet_del_protocol(&pim_protocol,
962                                                                 IPPROTO_PIM);
963                                 if (ret < 0)
964                                         ret = -EAGAIN;
965 #endif
966                         }
967                         rtnl_unlock();
968                         return ret;
969                 }
970 #endif
971                 /*
972                  *      Spurious command, or MRT_VERSION which you cannot
973                  *      set.
974                  */
975                 default:
976                         return -ENOPROTOOPT;
977         }
978 }
979
980 /*
981  *      Getsock opt support for the multicast routing system.
982  */
983  
984 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
985 {
986         int olr;
987         int val;
988
989         if(optname!=MRT_VERSION && 
990 #ifdef CONFIG_IP_PIMSM
991            optname!=MRT_PIM &&
992 #endif
993            optname!=MRT_ASSERT)
994                 return -ENOPROTOOPT;
995
996         if (get_user(olr, optlen))
997                 return -EFAULT;
998
999         olr = min_t(unsigned int, olr, sizeof(int));
1000         if (olr < 0)
1001                 return -EINVAL;
1002                 
1003         if(put_user(olr,optlen))
1004                 return -EFAULT;
1005         if(optname==MRT_VERSION)
1006                 val=0x0305;
1007 #ifdef CONFIG_IP_PIMSM
1008         else if(optname==MRT_PIM)
1009                 val=mroute_do_pim;
1010 #endif
1011         else
1012                 val=mroute_do_assert;
1013         if(copy_to_user(optval,&val,olr))
1014                 return -EFAULT;
1015         return 0;
1016 }
1017
1018 /*
1019  *      The IP multicast ioctl support routines.
1020  */
1021  
1022 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1023 {
1024         struct sioc_sg_req sr;
1025         struct sioc_vif_req vr;
1026         struct vif_device *vif;
1027         struct mfc_cache *c;
1028         
1029         switch(cmd)
1030         {
1031                 case SIOCGETVIFCNT:
1032                         if (copy_from_user(&vr,arg,sizeof(vr)))
1033                                 return -EFAULT; 
1034                         if(vr.vifi>=maxvif)
1035                                 return -EINVAL;
1036                         read_lock(&mrt_lock);
1037                         vif=&vif_table[vr.vifi];
1038                         if(VIF_EXISTS(vr.vifi)) {
1039                                 vr.icount=vif->pkt_in;
1040                                 vr.ocount=vif->pkt_out;
1041                                 vr.ibytes=vif->bytes_in;
1042                                 vr.obytes=vif->bytes_out;
1043                                 read_unlock(&mrt_lock);
1044
1045                                 if (copy_to_user(arg,&vr,sizeof(vr)))
1046                                         return -EFAULT;
1047                                 return 0;
1048                         }
1049                         read_unlock(&mrt_lock);
1050                         return -EADDRNOTAVAIL;
1051                 case SIOCGETSGCNT:
1052                         if (copy_from_user(&sr,arg,sizeof(sr)))
1053                                 return -EFAULT;
1054
1055                         read_lock(&mrt_lock);
1056                         c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1057                         if (c) {
1058                                 sr.pktcnt = c->mfc_un.res.pkt;
1059                                 sr.bytecnt = c->mfc_un.res.bytes;
1060                                 sr.wrong_if = c->mfc_un.res.wrong_if;
1061                                 read_unlock(&mrt_lock);
1062
1063                                 if (copy_to_user(arg,&sr,sizeof(sr)))
1064                                         return -EFAULT;
1065                                 return 0;
1066                         }
1067                         read_unlock(&mrt_lock);
1068                         return -EADDRNOTAVAIL;
1069                 default:
1070                         return -ENOIOCTLCMD;
1071         }
1072 }
1073
1074
1075 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1076 {
1077         struct vif_device *v;
1078         int ct;
1079         if (event != NETDEV_UNREGISTER)
1080                 return NOTIFY_DONE;
1081         v=&vif_table[0];
1082         for(ct=0;ct<maxvif;ct++,v++) {
1083                 if (v->dev==ptr)
1084                         vif_delete(ct);
1085         }
1086         return NOTIFY_DONE;
1087 }
1088
1089
1090 static struct notifier_block ip_mr_notifier={
1091         .notifier_call = ipmr_device_event,
1092 };
1093
1094 /*
1095  *      Encapsulate a packet by attaching a valid IPIP header to it.
1096  *      This avoids tunnel drivers and other mess and gives us the speed so
1097  *      important for multicast video.
1098  */
1099  
1100 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1101 {
1102         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1103
1104         iph->version    =       4;
1105         iph->tos        =       skb->nh.iph->tos;
1106         iph->ttl        =       skb->nh.iph->ttl;
1107         iph->frag_off   =       0;
1108         iph->daddr      =       daddr;
1109         iph->saddr      =       saddr;
1110         iph->protocol   =       IPPROTO_IPIP;
1111         iph->ihl        =       5;
1112         iph->tot_len    =       htons(skb->len);
1113         ip_select_ident(iph, skb->dst, NULL);
1114         ip_send_check(iph);
1115
1116         skb->h.ipiph = skb->nh.iph;
1117         skb->nh.iph = iph;
1118         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1119         nf_reset(skb);
1120 }
1121
1122 static inline int ipmr_forward_finish(struct sk_buff *skb)
1123 {
1124         struct ip_options * opt = &(IPCB(skb)->opt);
1125
1126         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1127
1128         if (unlikely(opt->optlen))
1129                 ip_forward_options(skb);
1130
1131         return dst_output(skb);
1132 }
1133
1134 /*
1135  *      Processing handlers for ipmr_forward
1136  */
1137
1138 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1139 {
1140         struct iphdr *iph = skb->nh.iph;
1141         struct vif_device *vif = &vif_table[vifi];
1142         struct net_device *dev;
1143         struct rtable *rt;
1144         int    encap = 0;
1145
1146         if (vif->dev == NULL)
1147                 goto out_free;
1148
1149 #ifdef CONFIG_IP_PIMSM
1150         if (vif->flags & VIFF_REGISTER) {
1151                 vif->pkt_out++;
1152                 vif->bytes_out+=skb->len;
1153                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1154                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1155                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1156                 kfree_skb(skb);
1157                 return;
1158         }
1159 #endif
1160
1161         if (vif->flags&VIFF_TUNNEL) {
1162                 struct flowi fl = { .oif = vif->link,
1163                                     .nl_u = { .ip4_u =
1164                                               { .daddr = vif->remote,
1165                                                 .saddr = vif->local,
1166                                                 .tos = RT_TOS(iph->tos) } },
1167                                     .proto = IPPROTO_IPIP };
1168                 if (ip_route_output_key(&rt, &fl))
1169                         goto out_free;
1170                 encap = sizeof(struct iphdr);
1171         } else {
1172                 struct flowi fl = { .oif = vif->link,
1173                                     .nl_u = { .ip4_u =
1174                                               { .daddr = iph->daddr,
1175                                                 .tos = RT_TOS(iph->tos) } },
1176                                     .proto = IPPROTO_IPIP };
1177                 if (ip_route_output_key(&rt, &fl))
1178                         goto out_free;
1179         }
1180
1181         dev = rt->u.dst.dev;
1182
1183         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1184                 /* Do not fragment multicasts. Alas, IPv4 does not
1185                    allow to send ICMP, so that packets will disappear
1186                    to blackhole.
1187                  */
1188
1189                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1190                 ip_rt_put(rt);
1191                 goto out_free;
1192         }
1193
1194         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1195
1196         if (skb_cow(skb, encap)) {
1197                 ip_rt_put(rt);
1198                 goto out_free;
1199         }
1200
1201         vif->pkt_out++;
1202         vif->bytes_out+=skb->len;
1203
1204         dst_release(skb->dst);
1205         skb->dst = &rt->u.dst;
1206         iph = skb->nh.iph;
1207         ip_decrease_ttl(iph);
1208
1209         /* FIXME: forward and output firewalls used to be called here.
1210          * What do we do with netfilter? -- RR */
1211         if (vif->flags & VIFF_TUNNEL) {
1212                 ip_encap(skb, vif->local, vif->remote);
1213                 /* FIXME: extra output firewall step used to be here. --RR */
1214                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1215                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1216         }
1217
1218         IPCB(skb)->flags |= IPSKB_FORWARDED;
1219
1220         /*
1221          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1222          * not only before forwarding, but after forwarding on all output
1223          * interfaces. It is clear, if mrouter runs a multicasting
1224          * program, it should receive packets not depending to what interface
1225          * program is joined.
1226          * If we will not make it, the program will have to join on all
1227          * interfaces. On the other hand, multihoming host (or router, but
1228          * not mrouter) cannot join to more than one interface - it will
1229          * result in receiving multiple packets.
1230          */
1231         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
1232                 ipmr_forward_finish);
1233         return;
1234
1235 out_free:
1236         kfree_skb(skb);
1237         return;
1238 }
1239
1240 static int ipmr_find_vif(struct net_device *dev)
1241 {
1242         int ct;
1243         for (ct=maxvif-1; ct>=0; ct--) {
1244                 if (vif_table[ct].dev == dev)
1245                         break;
1246         }
1247         return ct;
1248 }
1249
1250 /* "local" means that we should preserve one skb (for local delivery) */
1251
1252 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1253 {
1254         int psend = -1;
1255         int vif, ct;
1256
1257         vif = cache->mfc_parent;
1258         cache->mfc_un.res.pkt++;
1259         cache->mfc_un.res.bytes += skb->len;
1260
1261         /*
1262          * Wrong interface: drop packet and (maybe) send PIM assert.
1263          */
1264         if (vif_table[vif].dev != skb->dev) {
1265                 int true_vifi;
1266
1267                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1268                         /* It is our own packet, looped back.
1269                            Very complicated situation...
1270
1271                            The best workaround until routing daemons will be
1272                            fixed is not to redistribute packet, if it was
1273                            send through wrong interface. It means, that
1274                            multicast applications WILL NOT work for
1275                            (S,G), which have default multicast route pointing
1276                            to wrong oif. In any case, it is not a good
1277                            idea to use multicasting applications on router.
1278                          */
1279                         goto dont_forward;
1280                 }
1281
1282                 cache->mfc_un.res.wrong_if++;
1283                 true_vifi = ipmr_find_vif(skb->dev);
1284
1285                 if (true_vifi >= 0 && mroute_do_assert &&
1286                     /* pimsm uses asserts, when switching from RPT to SPT,
1287                        so that we cannot check that packet arrived on an oif.
1288                        It is bad, but otherwise we would need to move pretty
1289                        large chunk of pimd to kernel. Ough... --ANK
1290                      */
1291                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1292                     time_after(jiffies, 
1293                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1294                         cache->mfc_un.res.last_assert = jiffies;
1295                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1296                 }
1297                 goto dont_forward;
1298         }
1299
1300         vif_table[vif].pkt_in++;
1301         vif_table[vif].bytes_in+=skb->len;
1302
1303         /*
1304          *      Forward the frame
1305          */
1306         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1307                 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1308                         if (psend != -1) {
1309                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310                                 if (skb2)
1311                                         ipmr_queue_xmit(skb2, cache, psend);
1312                         }
1313                         psend=ct;
1314                 }
1315         }
1316         if (psend != -1) {
1317                 if (local) {
1318                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1319                         if (skb2)
1320                                 ipmr_queue_xmit(skb2, cache, psend);
1321                 } else {
1322                         ipmr_queue_xmit(skb, cache, psend);
1323                         return 0;
1324                 }
1325         }
1326
1327 dont_forward:
1328         if (!local)
1329                 kfree_skb(skb);
1330         return 0;
1331 }
1332
1333
1334 /*
1335  *      Multicast packets for forwarding arrive here
1336  */
1337
1338 int ip_mr_input(struct sk_buff *skb)
1339 {
1340         struct mfc_cache *cache;
1341         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1342
1343         /* Packet is looped back after forward, it should not be
1344            forwarded second time, but still can be delivered locally.
1345          */
1346         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1347                 goto dont_forward;
1348
1349         if (!local) {
1350                     if (IPCB(skb)->opt.router_alert) {
1351                             if (ip_call_ra_chain(skb))
1352                                     return 0;
1353                     } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1354                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1355                                Cisco IOS <= 11.2(8)) do not put router alert
1356                                option to IGMP packets destined to routable
1357                                groups. It is very bad, because it means
1358                                that we can forward NO IGMP messages.
1359                              */
1360                             read_lock(&mrt_lock);
1361                             if (mroute_socket) {
1362                                     nf_reset(skb);
1363                                     raw_rcv(mroute_socket, skb);
1364                                     read_unlock(&mrt_lock);
1365                                     return 0;
1366                             }
1367                             read_unlock(&mrt_lock);
1368                     }
1369         }
1370
1371         read_lock(&mrt_lock);
1372         cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1373
1374         /*
1375          *      No usable cache entry
1376          */
1377         if (cache==NULL) {
1378                 int vif;
1379
1380                 if (local) {
1381                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1382                         ip_local_deliver(skb);
1383                         if (skb2 == NULL) {
1384                                 read_unlock(&mrt_lock);
1385                                 return -ENOBUFS;
1386                         }
1387                         skb = skb2;
1388                 }
1389
1390                 vif = ipmr_find_vif(skb->dev);
1391                 if (vif >= 0) {
1392                         int err = ipmr_cache_unresolved(vif, skb);
1393                         read_unlock(&mrt_lock);
1394
1395                         return err;
1396                 }
1397                 read_unlock(&mrt_lock);
1398                 kfree_skb(skb);
1399                 return -ENODEV;
1400         }
1401
1402         ip_mr_forward(skb, cache, local);
1403
1404         read_unlock(&mrt_lock);
1405
1406         if (local)
1407                 return ip_local_deliver(skb);
1408
1409         return 0;
1410
1411 dont_forward:
1412         if (local)
1413                 return ip_local_deliver(skb);
1414         kfree_skb(skb);
1415         return 0;
1416 }
1417
1418 #ifdef CONFIG_IP_PIMSM_V1
1419 /*
1420  * Handle IGMP messages of PIMv1
1421  */
1422
1423 int pim_rcv_v1(struct sk_buff * skb)
1424 {
1425         struct igmphdr *pim;
1426         struct iphdr   *encap;
1427         struct net_device  *reg_dev = NULL;
1428
1429         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1430                 goto drop;
1431
1432         pim = (struct igmphdr*)skb->h.raw;
1433
1434         if (!mroute_do_pim ||
1435             skb->len < sizeof(*pim) + sizeof(*encap) ||
1436             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
1437                 goto drop;
1438
1439         encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1440         /*
1441            Check that:
1442            a. packet is really destinted to a multicast group
1443            b. packet is not a NULL-REGISTER
1444            c. packet is not truncated
1445          */
1446         if (!MULTICAST(encap->daddr) ||
1447             encap->tot_len == 0 ||
1448             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1449                 goto drop;
1450
1451         read_lock(&mrt_lock);
1452         if (reg_vif_num >= 0)
1453                 reg_dev = vif_table[reg_vif_num].dev;
1454         if (reg_dev)
1455                 dev_hold(reg_dev);
1456         read_unlock(&mrt_lock);
1457
1458         if (reg_dev == NULL) 
1459                 goto drop;
1460
1461         skb->mac.raw = skb->nh.raw;
1462         skb_pull(skb, (u8*)encap - skb->data);
1463         skb->nh.iph = (struct iphdr *)skb->data;
1464         skb->dev = reg_dev;
1465         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1466         skb->protocol = htons(ETH_P_IP);
1467         skb->ip_summed = 0;
1468         skb->pkt_type = PACKET_HOST;
1469         dst_release(skb->dst);
1470         skb->dst = NULL;
1471         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1472         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1473         nf_reset(skb);
1474         netif_rx(skb);
1475         dev_put(reg_dev);
1476         return 0;
1477  drop:
1478         kfree_skb(skb);
1479         return 0;
1480 }
1481 #endif
1482
1483 #ifdef CONFIG_IP_PIMSM_V2
1484 static int pim_rcv(struct sk_buff * skb)
1485 {
1486         struct pimreghdr *pim;
1487         struct iphdr   *encap;
1488         struct net_device  *reg_dev = NULL;
1489
1490         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1491                 goto drop;
1492
1493         pim = (struct pimreghdr*)skb->h.raw;
1494         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1495             (pim->flags&PIM_NULL_REGISTER) ||
1496             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
1497              (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 
1498                 goto drop;
1499
1500         /* check if the inner packet is destined to mcast group */
1501         encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1502         if (!MULTICAST(encap->daddr) ||
1503             encap->tot_len == 0 ||
1504             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1505                 goto drop;
1506
1507         read_lock(&mrt_lock);
1508         if (reg_vif_num >= 0)
1509                 reg_dev = vif_table[reg_vif_num].dev;
1510         if (reg_dev)
1511                 dev_hold(reg_dev);
1512         read_unlock(&mrt_lock);
1513
1514         if (reg_dev == NULL) 
1515                 goto drop;
1516
1517         skb->mac.raw = skb->nh.raw;
1518         skb_pull(skb, (u8*)encap - skb->data);
1519         skb->nh.iph = (struct iphdr *)skb->data;
1520         skb->dev = reg_dev;
1521         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1522         skb->protocol = htons(ETH_P_IP);
1523         skb->ip_summed = 0;
1524         skb->pkt_type = PACKET_HOST;
1525         dst_release(skb->dst);
1526         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1527         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1528         skb->dst = NULL;
1529         nf_reset(skb);
1530         netif_rx(skb);
1531         dev_put(reg_dev);
1532         return 0;
1533  drop:
1534         kfree_skb(skb);
1535         return 0;
1536 }
1537 #endif
1538
1539 static int
1540 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1541 {
1542         int ct;
1543         struct rtnexthop *nhp;
1544         struct net_device *dev = vif_table[c->mfc_parent].dev;
1545         u8 *b = skb->tail;
1546         struct rtattr *mp_head;
1547
1548         if (dev)
1549                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1550
1551         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1552
1553         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1554                 if (c->mfc_un.res.ttls[ct] < 255) {
1555                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1556                                 goto rtattr_failure;
1557                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1558                         nhp->rtnh_flags = 0;
1559                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1560                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1561                         nhp->rtnh_len = sizeof(*nhp);
1562                 }
1563         }
1564         mp_head->rta_type = RTA_MULTIPATH;
1565         mp_head->rta_len = skb->tail - (u8*)mp_head;
1566         rtm->rtm_type = RTN_MULTICAST;
1567         return 1;
1568
1569 rtattr_failure:
1570         skb_trim(skb, b - skb->data);
1571         return -EMSGSIZE;
1572 }
1573
1574 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1575 {
1576         int err;
1577         struct mfc_cache *cache;
1578         struct rtable *rt = (struct rtable*)skb->dst;
1579
1580         read_lock(&mrt_lock);
1581         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1582
1583         if (cache==NULL) {
1584                 struct net_device *dev;
1585                 int vif;
1586
1587                 if (nowait) {
1588                         read_unlock(&mrt_lock);
1589                         return -EAGAIN;
1590                 }
1591
1592                 dev = skb->dev;
1593                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1594                         read_unlock(&mrt_lock);
1595                         return -ENODEV;
1596                 }
1597                 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
1598                 skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1599                 skb->nh.iph->saddr = rt->rt_src;
1600                 skb->nh.iph->daddr = rt->rt_dst;
1601                 skb->nh.iph->version = 0;
1602                 err = ipmr_cache_unresolved(vif, skb);
1603                 read_unlock(&mrt_lock);
1604                 return err;
1605         }
1606
1607         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1608                 cache->mfc_flags |= MFC_NOTIFY;
1609         err = ipmr_fill_mroute(skb, cache, rtm);
1610         read_unlock(&mrt_lock);
1611         return err;
1612 }
1613
1614 #ifdef CONFIG_PROC_FS   
/*
 *	The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
 */

/* Per-open iteration state of the vif seq_file. */
struct ipmr_vif_iter {
	int ct;		/* index into vif_table of the current position */
};
1621
1622 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1623                                            loff_t pos)
1624 {
1625         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1626                 if(!VIF_EXISTS(iter->ct))
1627                         continue;
1628                 if (pos-- == 0) 
1629                         return &vif_table[iter->ct];
1630         }
1631         return NULL;
1632 }
1633
1634 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1635 {
1636         read_lock(&mrt_lock);
1637         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 
1638                 : SEQ_START_TOKEN;
1639 }
1640
1641 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1642 {
1643         struct ipmr_vif_iter *iter = seq->private;
1644
1645         ++*pos;
1646         if (v == SEQ_START_TOKEN)
1647                 return ipmr_vif_seq_idx(iter, 0);
1648         
1649         while (++iter->ct < maxvif) {
1650                 if(!VIF_EXISTS(iter->ct))
1651                         continue;
1652                 return &vif_table[iter->ct];
1653         }
1654         return NULL;
1655 }
1656
1657 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1658 {
1659         read_unlock(&mrt_lock);
1660 }
1661
1662 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1663 {
1664         if (v == SEQ_START_TOKEN) {
1665                 seq_puts(seq, 
1666                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1667         } else {
1668                 const struct vif_device *vif = v;
1669                 const char *name =  vif->dev ? vif->dev->name : "none";
1670
1671                 seq_printf(seq,
1672                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1673                            vif - vif_table,
1674                            name, vif->bytes_in, vif->pkt_in, 
1675                            vif->bytes_out, vif->pkt_out,
1676                            vif->flags, vif->local, vif->remote);
1677         }
1678         return 0;
1679 }
1680
1681 static struct seq_operations ipmr_vif_seq_ops = {
1682         .start = ipmr_vif_seq_start,
1683         .next  = ipmr_vif_seq_next,
1684         .stop  = ipmr_vif_seq_stop,
1685         .show  = ipmr_vif_seq_show,
1686 };
1687
1688 static int ipmr_vif_open(struct inode *inode, struct file *file)
1689 {
1690         struct seq_file *seq;
1691         int rc = -ENOMEM;
1692         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1693        
1694         if (!s)
1695                 goto out;
1696
1697         rc = seq_open(file, &ipmr_vif_seq_ops);
1698         if (rc)
1699                 goto out_kfree;
1700
1701         s->ct = 0;
1702         seq = file->private_data;
1703         seq->private = s;
1704 out:
1705         return rc;
1706 out_kfree:
1707         kfree(s);
1708         goto out;
1709
1710 }
1711
1712 static struct file_operations ipmr_vif_fops = {
1713         .owner   = THIS_MODULE,
1714         .open    = ipmr_vif_open,
1715         .read    = seq_read,
1716         .llseek  = seq_lseek,
1717         .release = seq_release_private,
1718 };
1719
/* Per-open iteration state of the mfc cache seq_file. */
struct ipmr_mfc_iter {
	/*
	 * List currently being walked: mfc_cache_array, &mfc_unres_queue,
	 * or NULL when neither is (and therefore no lock is held).
	 */
	struct mfc_cache **cache;
	int ct;		/* current line of mfc_cache_array */
};
1724
1725
1726 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1727 {
1728         struct mfc_cache *mfc;
1729
1730         it->cache = mfc_cache_array;
1731         read_lock(&mrt_lock);
1732         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 
1733                 for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 
1734                         if (pos-- == 0) 
1735                                 return mfc;
1736         read_unlock(&mrt_lock);
1737
1738         it->cache = &mfc_unres_queue;
1739         spin_lock_bh(&mfc_unres_lock);
1740         for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) 
1741                 if (pos-- == 0)
1742                         return mfc;
1743         spin_unlock_bh(&mfc_unres_lock);
1744
1745         it->cache = NULL;
1746         return NULL;
1747 }
1748
1749
1750 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1751 {
1752         struct ipmr_mfc_iter *it = seq->private;
1753         it->cache = NULL;
1754         it->ct = 0;
1755         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 
1756                 : SEQ_START_TOKEN;
1757 }
1758
1759 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1760 {
1761         struct mfc_cache *mfc = v;
1762         struct ipmr_mfc_iter *it = seq->private;
1763
1764         ++*pos;
1765
1766         if (v == SEQ_START_TOKEN)
1767                 return ipmr_mfc_seq_idx(seq->private, 0);
1768
1769         if (mfc->next)
1770                 return mfc->next;
1771         
1772         if (it->cache == &mfc_unres_queue) 
1773                 goto end_of_list;
1774
1775         BUG_ON(it->cache != mfc_cache_array);
1776
1777         while (++it->ct < MFC_LINES) {
1778                 mfc = mfc_cache_array[it->ct];
1779                 if (mfc)
1780                         return mfc;
1781         }
1782
1783         /* exhausted cache_array, show unresolved */
1784         read_unlock(&mrt_lock);
1785         it->cache = &mfc_unres_queue;
1786         it->ct = 0;
1787                 
1788         spin_lock_bh(&mfc_unres_lock);
1789         mfc = mfc_unres_queue;
1790         if (mfc) 
1791                 return mfc;
1792
1793  end_of_list:
1794         spin_unlock_bh(&mfc_unres_lock);
1795         it->cache = NULL;
1796
1797         return NULL;
1798 }
1799
1800 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1801 {
1802         struct ipmr_mfc_iter *it = seq->private;
1803
1804         if (it->cache == &mfc_unres_queue)
1805                 spin_unlock_bh(&mfc_unres_lock);
1806         else if (it->cache == mfc_cache_array)
1807                 read_unlock(&mrt_lock);
1808 }
1809
1810 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1811 {
1812         int n;
1813
1814         if (v == SEQ_START_TOKEN) {
1815                 seq_puts(seq, 
1816                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1817         } else {
1818                 const struct mfc_cache *mfc = v;
1819                 const struct ipmr_mfc_iter *it = seq->private;
1820                 
1821                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1822                            (unsigned long) mfc->mfc_mcastgrp,
1823                            (unsigned long) mfc->mfc_origin,
1824                            mfc->mfc_parent,
1825                            mfc->mfc_un.res.pkt,
1826                            mfc->mfc_un.res.bytes,
1827                            mfc->mfc_un.res.wrong_if);
1828
1829                 if (it->cache != &mfc_unres_queue) {
1830                         for(n = mfc->mfc_un.res.minvif; 
1831                             n < mfc->mfc_un.res.maxvif; n++ ) {
1832                                 if(VIF_EXISTS(n) 
1833                                    && mfc->mfc_un.res.ttls[n] < 255)
1834                                 seq_printf(seq, 
1835                                            " %2d:%-3d", 
1836                                            n, mfc->mfc_un.res.ttls[n]);
1837                         }
1838                 }
1839                 seq_putc(seq, '\n');
1840         }
1841         return 0;
1842 }
1843
1844 static struct seq_operations ipmr_mfc_seq_ops = {
1845         .start = ipmr_mfc_seq_start,
1846         .next  = ipmr_mfc_seq_next,
1847         .stop  = ipmr_mfc_seq_stop,
1848         .show  = ipmr_mfc_seq_show,
1849 };
1850
1851 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1852 {
1853         struct seq_file *seq;
1854         int rc = -ENOMEM;
1855         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1856        
1857         if (!s)
1858                 goto out;
1859
1860         rc = seq_open(file, &ipmr_mfc_seq_ops);
1861         if (rc)
1862                 goto out_kfree;
1863
1864         seq = file->private_data;
1865         seq->private = s;
1866 out:
1867         return rc;
1868 out_kfree:
1869         kfree(s);
1870         goto out;
1871
1872 }
1873
1874 static struct file_operations ipmr_mfc_fops = {
1875         .owner   = THIS_MODULE,
1876         .open    = ipmr_mfc_open,
1877         .read    = seq_read,
1878         .llseek  = seq_lseek,
1879         .release = seq_release_private,
1880 };
1881 #endif  
1882
1883 #ifdef CONFIG_IP_PIMSM_V2
1884 static struct net_protocol pim_protocol = {
1885         .handler        =       pim_rcv,
1886 };
1887 #endif
1888
1889
1890 /*
1891  *      Setup for IP multicast routing
1892  */
1893  
1894 void __init ip_mr_init(void)
1895 {
1896         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1897                                        sizeof(struct mfc_cache),
1898                                        0, SLAB_HWCACHE_ALIGN,
1899                                        NULL, NULL);
1900         if (!mrt_cachep)
1901                 panic("cannot allocate ip_mrt_cache");
1902
1903         init_timer(&ipmr_expire_timer);
1904         ipmr_expire_timer.function=ipmr_expire_process;
1905         register_netdevice_notifier(&ip_mr_notifier);
1906 #ifdef CONFIG_PROC_FS   
1907         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1908         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1909 #endif  
1910 }