/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <linux/config.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries
   is protected with the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */
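
/* A minimal sketch of the locking discipline described above, as used
 * throughout this file.  The data path only ever takes mrt_lock shared;
 * updaters run in process context and take it exclusive with bottom
 * halves disabled; the unresolved queue always uses its own spinlock:
 *
 *      read_lock(&mrt_lock);           ... lookup / forward ...
 *      read_unlock(&mrt_lock);
 *
 *      write_lock_bh(&mrt_lock);       ... vif table / cache update ...
 *      write_unlock_bh(&mrt_lock);
 *
 *      spin_lock_bh(&mfc_unres_lock);  ... unresolved queue ...
 *      spin_unlock_bh(&mfc_unres_lock);
 */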

static kmem_cache_t *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

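/* Create a DVMRP tunnel device.  There is no in-kernel API for this, so
 * we drive the IPIP driver through the tunl0 device's SIOCADDTUNNEL ioctl
 * under set_fs(KERNEL_DS), asking it to create a "dvmrp%d" device for
 * this vif.
 */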
static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name("tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
                                goto failure;
                        in_dev->cnf.rp_filter = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

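/* The pimreg device: whatever is transmitted on it is not sent anywhere,
 * but bounced whole to the mroute daemon as an IGMPMSG_WHOLEPKT upcall.
 * This is how PIM-SM register encapsulation is delegated to user space.
 */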
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
        ((struct net_device_stats*)dev->priv)->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)dev->priv;
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = 1500 - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        if ((in_dev = inetdev_init(dev)) == NULL)
                goto failure;

        in_dev->cnf.rp_filter = 0;

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

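/* Called with the RTNL semaphore held (unregister_netdevice() relies on
 * it); mrt_lock is taken here only for the vif table update itself.
 */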
static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                in_dev->cnf.mc_forwarding--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));
                        netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. It is called under write-locked mrt_lock. */

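/* A ttl of 0 or 255 in the user-supplied array means "do not forward on
 * this vif"; anything in between is the threshold the packet's TTL must
 * exceed for it to be forwarded there.  minvif/maxvif bracket the range
 * of vifs the forwarding loop in ip_mr_forward() has to scan.
 */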
static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

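/* Add a virtual interface, under RTNL.  VIFF_REGISTER and VIFF_TUNNEL
 * vifs get a device created for them here; a plain vif must name an
 * existing local address.
 */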
static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is the vif busy? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                __dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        in_dev->cnf.mc_forwarding++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
        if(c==NULL)
                return NULL;
        memset(c, 0, sizeof(*c));
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
        if(c==NULL)
                return NULL;
        memset(c, 0, sizeof(*c));
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from the unresolved queue into a resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        int err;
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = skb->tail - (u8*)nlh;
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }
                        err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        int ihl = pkt->nh.iph->ihl<<2;
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if(!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
                skb->nh.raw = skb->h.raw = (u8*)msg;
                memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
                skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
        memcpy(skb->data,pkt->data,ihl);
        skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg*)skb->nh.iph;
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
        skb->h.raw = skb->nh.raw;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. It gets a locked cache entry!
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
                    c->mfc_origin == skb->nh.iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent=-1;
                c->mfc_origin=skb->nh.iph->saddr;
                c->mfc_mcastgrp=skb->nh.iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

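/* Both functions below run under RTNL, from ip_mroute_setsockopt(); the
 * hash chain itself is only modified with mrt_lock write-held, so the
 * data path can keep walking it under the read lock.
 */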
static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for(i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

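/* A minimal user-space sketch of this interface (illustrative only; the
 * local address is hypothetical and error handling is omitted).  The MRT_*
 * options are issued at the IPPROTO_IP level on the raw IGMP socket that
 * becomes the mroute socket:
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *      struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *      vc.vifc_lcl_addr.s_addr = htonl(0xc0a80001);   hypothetical address
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */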
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if(optname!=MRT_INIT)
        {
                if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch(optname)
        {
                case MRT_INIT:
                        if (sk->sk_type != SOCK_RAW ||
                            inet_sk(sk)->num != IPPROTO_IGMP)
                                return -EOPNOTSUPP;
                        if(optlen!=sizeof(int))
                                return -ENOPROTOOPT;

                        rtnl_lock();
                        if (mroute_socket) {
                                rtnl_unlock();
                                return -EADDRINUSE;
                        }

                        ret = ip_ra_control(sk, 1, mrtsock_destruct);
                        if (ret == 0) {
                                write_lock_bh(&mrt_lock);
                                mroute_socket=sk;
                                write_unlock_bh(&mrt_lock);

                                ipv4_devconf.mc_forwarding++;
                        }
                        rtnl_unlock();
                        return ret;
                case MRT_DONE:
                        if (sk!=mroute_socket)
                                return -EACCES;
                        return ip_ra_control(sk, 0, NULL);
                case MRT_ADD_VIF:
                case MRT_DEL_VIF:
                        if(optlen!=sizeof(vif))
                                return -EINVAL;
                        if (copy_from_user(&vif,optval,sizeof(vif)))
                                return -EFAULT;
                        if(vif.vifc_vifi >= MAXVIFS)
                                return -ENFILE;
                        rtnl_lock();
                        if (optname==MRT_ADD_VIF) {
                                ret = vif_add(&vif, sk==mroute_socket);
                        } else {
                                ret = vif_delete(vif.vifc_vifi);
                        }
                        rtnl_unlock();
                        return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
                case MRT_ADD_MFC:
                case MRT_DEL_MFC:
                        if(optlen!=sizeof(mfc))
                                return -EINVAL;
                        if (copy_from_user(&mfc,optval, sizeof(mfc)))
                                return -EFAULT;
                        rtnl_lock();
                        if (optname==MRT_DEL_MFC)
                                ret = ipmr_mfc_delete(&mfc);
                        else
                                ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                        rtnl_unlock();
                        return ret;
                /*
                 *      Control PIM assert.
                 */
                case MRT_ASSERT:
                {
                        int v;
                        if(get_user(v,(int __user *)optval))
                                return -EFAULT;
                        mroute_do_assert=(v)?1:0;
                        return 0;
                }
#ifdef CONFIG_IP_PIMSM
                case MRT_PIM:
                {
                        int v, ret;
                        if(get_user(v,(int __user *)optval))
                                return -EFAULT;
                        v = (v)?1:0;
                        rtnl_lock();
                        ret = 0;
                        if (v != mroute_do_pim) {
                                mroute_do_pim = v;
                                mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                                if (mroute_do_pim)
                                        ret = inet_add_protocol(&pim_protocol,
                                                                IPPROTO_PIM);
                                else
                                        ret = inet_del_protocol(&pim_protocol,
                                                                IPPROTO_PIM);
                                if (ret < 0)
                                        ret = -EAGAIN;
#endif
                        }
                        rtnl_unlock();
                        return ret;
                }
#endif
                /*
                 *      Spurious command, or MRT_VERSION which you cannot
                 *      set.
                 */
                default:
                        return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if(optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if(put_user(olr,optlen))
                return -EFAULT;
        if(optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if(optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if(copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch(cmd)
        {
                case SIOCGETVIFCNT:
                        if (copy_from_user(&vr,arg,sizeof(vr)))
                                return -EFAULT;
                        if(vr.vifi>=maxvif)
                                return -EINVAL;
                        read_lock(&mrt_lock);
                        vif=&vif_table[vr.vifi];
                        if(VIF_EXISTS(vr.vifi)) {
                                vr.icount=vif->pkt_in;
                                vr.ocount=vif->pkt_out;
                                vr.ibytes=vif->bytes_in;
                                vr.obytes=vif->bytes_out;
                                read_unlock(&mrt_lock);

                                if (copy_to_user(arg,&vr,sizeof(vr)))
                                        return -EFAULT;
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                        return -EADDRNOTAVAIL;
                case SIOCGETSGCNT:
                        if (copy_from_user(&sr,arg,sizeof(sr)))
                                return -EFAULT;

                        read_lock(&mrt_lock);
                        c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                        if (c) {
                                sr.pktcnt = c->mfc_un.res.pkt;
                                sr.bytecnt = c->mfc_un.res.bytes;
                                sr.wrong_if = c->mfc_un.res.wrong_if;
                                read_unlock(&mrt_lock);

                                if (copy_to_user(arg,&sr,sizeof(sr)))
                                        return -EFAULT;
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                        return -EADDRNOTAVAIL;
                default:
                        return -ENOIOCTLCMD;
        }
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct vif_device *v;
        int ct;
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for(ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==ptr)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

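/* The resulting layout is outer IP header | original IP header | payload;
 * the caller (ipmr_queue_xmit) has already reserved the extra headroom
 * via skb_cow() before we get here.
 */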
static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
{
        struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));

        iph->version    =       4;
        iph->tos        =       skb->nh.iph->tos;
        iph->ttl        =       skb->nh.iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        skb->h.ipiph = skb->nh.iph;
        skb->nh.iph = iph;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        struct iphdr *iph = skb->nh.iph;
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
                ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so the packets will
                   disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        iph = skb->nh.iph;
        ip_decrease_ttl(iph);

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
                ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
         * not only before forwarding, but also after forwarding on all output
         * interfaces. Clearly, if an mrouter runs a multicasting program,
         * it should receive packets regardless of which interface the
         * program joined on.
         * If we did not do this, the program would have to join on all
         * interfaces. On the other hand, a multihomed host (or router, but
         * not mrouter) cannot join on more than one interface - that would
         * result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

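/* The frame is cloned for every output vif except the last one, which
 * consumes the original skb unless a local copy must survive; a vif
 * forwards only if the packet's TTL exceeds its configured threshold.
 */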
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround, until the routing daemons
                           are fixed, is not to redistribute a packet if it
                           was sent through the wrong interface. It means
                           that multicast applications WILL NOT work for
                           (S,G) entries whose default multicast route points
                           to the wrong oif. In any case, it is not a good
                           idea to run multicast applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* PIM-SM uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       It is bad, but otherwise we would need to move a
                       pretty large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* A packet looped back after forwarding must not be forwarded
           a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations,
                               such as Cisco IOS <= 11.2(8)) do not put the
                               router alert option in IGMP packets destined
                               to routable groups. It is very bad, because
                               it means that we cannot forward IGMP messages
                               at all.
                             */
                            read_lock(&mrt_lock);
                            if (mroute_socket) {
                                    nf_reset(skb);
                                    raw_rcv(mroute_socket, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

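/* PIMv1 shares the IGMP protocol number, so register packets arrive here
 * as IGMP messages.  We strip the outer IP and PIM headers and feed the
 * encapsulated multicast packet back through netif_rx() on the pimreg
 * device, as if it had been received there.
 */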
int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct igmphdr*)skb->h.raw;

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb->nh.iph = (struct iphdr *)skb->data;
        skb->dev = reg_dev;
        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
        ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr*)skb->h.raw;
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb->nh.iph = (struct iphdr *)skb->data;
        skb->dev = reg_dev;
        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
        ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

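/* Dump one resolved cache entry into an rtnetlink message: the parent vif
 * becomes RTA_IIF and every output vif becomes an RTA_MULTIPATH nexthop
 * whose rtnh_hops carries the TTL threshold.  Returns 1 on success and
 * -EMSGSIZE if the skb runs out of tailroom.
 */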
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8*)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -EMSGSIZE;
}

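/*
 * rtnetlink lookup for a multicast route.  If the flow has no cache
 * entry yet and the caller can sleep, queue the packet as unresolved:
 * a minimal fake IP header is pushed, marked with version 0, so the
 * resolver can tell this query apart from real traffic and answer it
 * via rtnetlink once the route is learned.
 */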
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = (struct rtable*)skb->dst;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
                skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
                skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
                skb->nh.iph->saddr = rt->rt_src;
                skb->nh.iph->daddr = rt->rt_dst;
                skb->nh.iph->version = 0;       /* Marks a resolver query, not real traffic */
                err = ipmr_cache_unresolved(vif, skb);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
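/*
 * Illustrative ip_mr_vif output (the values below are made up):
 *
 *  Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *   0 eth0          12345     678     23456     789 00000 C0A80001 00000000
 */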
struct ipmr_vif_iter {
        int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}

static struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_vif_seq_ops);
        if (rc)
                goto out_kfree;

        s->ct = 0;
        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};

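/*
 * The MFC dump walks two lists: the hash buckets of resolved entries
 * under mrt_lock, then the unresolved queue under mfc_unres_lock.
 * it->cache records which list (and therefore which lock) is current,
 * so next() can hand over between them and stop() can drop whichever
 * lock is still held.
 */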
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;
        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* Exhausted cache_array, show unresolved */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent,
                           mfc->mfc_un.res.pkt,
                           mfc->mfc_un.res.bytes,
                           mfc->mfc_un.res.wrong_if);

                /* Unresolved entries have no valid oif list yet */
                if (it->cache != &mfc_unres_queue) {
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n)
                                    && mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                }
                seq_putc(seq, '\n');
        }
        return 0;
}

static struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_mfc_seq_ops);
        if (rc)
                goto out_kfree;

        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
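/*
 * Receive handler for IPPROTO_PIM.  It is not registered here at init
 * time; it is expected to be attached with inet_add_protocol() when
 * PIM support is switched on at run time (the MRT_PIM socket option
 * handled earlier in this file).
 */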
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif

/*
 *      Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN,
                                       NULL, NULL);
        if (!mrt_cachep)
                panic("cannot allocate ip_mrt_cache");

        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function = ipmr_expire_process;
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}