net/ipv4/ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <linux/if_ether.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66
67 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
68 #define CONFIG_IP_PIMSM 1
69 #endif
70
71 static struct sock *mroute_socket;
72
73
74 /* Big lock, protecting the vif table, the mrt cache and the mroute socket
75    state. Note that changes are additionally serialised via rtnl_lock.
76  */
77
78 static DEFINE_RWLOCK(mrt_lock);
79
80 /*
81  *      Multicast router control variables
82  */
83
84 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
85 static int maxvif;
86
87 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
88
89 static int mroute_do_assert;                            /* Set in PIM assert    */
90 static int mroute_do_pim;
91
92 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
93
94 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
95 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
96
97 /* Special spinlock for queue of unresolved entries */
98 static DEFINE_SPINLOCK(mfc_unres_lock);
99
100 /* We return to Alan's original scheme. The hash table of resolved
101    entries is changed only in process context and is protected by
102    the weak lock mrt_lock. The queue of unresolved entries is
103    protected by the strong spinlock mfc_unres_lock.
104
105    This way the data path is entirely free of exclusive locks.
106  */
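
/* For illustration, a minimal sketch of the locking discipline described
   above. The helpers below are hypothetical and only show the pattern:
   readers on the data path take the shared side of mrt_lock, while
   configuration updates, already serialised by rtnl_lock, take the
   exclusive side only around the pointer manipulation itself. */
#if 0   /* illustrative sketch, not built */
static void example_mrt_reader(void)
{
        read_lock(&mrt_lock);
        /* ... look up vif_table[] or mfc_cache_array[] ... */
        read_unlock(&mrt_lock);
}

static void example_mrt_writer(void)
{
        ASSERT_RTNL();                  /* config path runs under rtnl_lock */
        write_lock_bh(&mrt_lock);       /* _bh: readers may run in softirq */
        /* ... relink a hash chain or vif entry ... */
        write_unlock_bh(&mrt_lock);
}
#endif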
107
108 static kmem_cache_t *mrt_cachep __read_mostly;
109
110 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
111 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
112 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
113
114 #ifdef CONFIG_IP_PIMSM_V2
115 static struct net_protocol pim_protocol;
116 #endif
117
118 static struct timer_list ipmr_expire_timer;
119
120 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
121
122 static
123 struct net_device *ipmr_new_tunnel(struct vifctl *v)
124 {
125         struct net_device  *dev;
126
127         dev = __dev_get_by_name("tunl0");
128
129         if (dev) {
130                 int err;
131                 struct ifreq ifr;
132                 mm_segment_t    oldfs;
133                 struct ip_tunnel_parm p;
134                 struct in_device  *in_dev;
135
136                 memset(&p, 0, sizeof(p));
137                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
138                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
139                 p.iph.version = 4;
140                 p.iph.ihl = 5;
141                 p.iph.protocol = IPPROTO_IPIP;
142                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
143                 ifr.ifr_ifru.ifru_data = (void*)&p;
144
145                 oldfs = get_fs(); set_fs(KERNEL_DS);
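                /* set_fs(KERNEL_DS) temporarily lifts the user-pointer
                   check, so that the tunnel driver's ioctl handler, which
                   expects a userspace buffer, accepts our on-stack
                   ip_tunnel_parm. */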
146                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
147                 set_fs(oldfs);
148
149                 dev = NULL;
150
151                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
152                         dev->flags |= IFF_MULTICAST;
153
154                         in_dev = __in_dev_get_rtnl(dev);
155                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
156                                 goto failure;
157                         in_dev->cnf.rp_filter = 0;
158
159                         if (dev_open(dev))
160                                 goto failure;
161                 }
162         }
163         return dev;
164
165 failure:
166         /* allow the registration to complete before unregistering. */
167         rtnl_unlock();
168         rtnl_lock();
169
170         unregister_netdevice(dev);
171         return NULL;
172 }
173
174 #ifdef CONFIG_IP_PIMSM
175
176 static int reg_vif_num = -1;
177
178 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
179 {
180         read_lock(&mrt_lock);
181         ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
182         ((struct net_device_stats*)dev->priv)->tx_packets++;
183         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
184         read_unlock(&mrt_lock);
185         kfree_skb(skb);
186         return 0;
187 }
188
189 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
190 {
191         return (struct net_device_stats*)dev->priv;
192 }
193
194 static void reg_vif_setup(struct net_device *dev)
195 {
196         dev->type               = ARPHRD_PIMREG;
197         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
198         dev->flags              = IFF_NOARP;
199         dev->hard_start_xmit    = reg_vif_xmit;
200         dev->get_stats          = reg_vif_get_stats;
201         dev->destructor         = free_netdev;
202 }
203
204 static struct net_device *ipmr_reg_vif(void)
205 {
206         struct net_device *dev;
207         struct in_device *in_dev;
208
209         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
210                            reg_vif_setup);
211
212         if (dev == NULL)
213                 return NULL;
214
215         if (register_netdevice(dev)) {
216                 free_netdev(dev);
217                 return NULL;
218         }
219         dev->iflink = 0;
220
221         if ((in_dev = inetdev_init(dev)) == NULL)
222                 goto failure;
223
224         in_dev->cnf.rp_filter = 0;
225
226         if (dev_open(dev))
227                 goto failure;
228
229         return dev;
230
231 failure:
232         /* allow the registration to complete before unregistering. */
233         rtnl_unlock();
234         rtnl_lock();
235
236         unregister_netdevice(dev);
237         return NULL;
238 }
239 #endif
240
241 /*
242  *      Delete a VIF entry
243  */
244  
245 static int vif_delete(int vifi)
246 {
247         struct vif_device *v;
248         struct net_device *dev;
249         struct in_device *in_dev;
250
251         if (vifi < 0 || vifi >= maxvif)
252                 return -EADDRNOTAVAIL;
253
254         v = &vif_table[vifi];
255
256         write_lock_bh(&mrt_lock);
257         dev = v->dev;
258         v->dev = NULL;
259
260         if (!dev) {
261                 write_unlock_bh(&mrt_lock);
262                 return -EADDRNOTAVAIL;
263         }
264
265 #ifdef CONFIG_IP_PIMSM
266         if (vifi == reg_vif_num)
267                 reg_vif_num = -1;
268 #endif
269
270         if (vifi+1 == maxvif) {
271                 int tmp;
272                 for (tmp=vifi-1; tmp>=0; tmp--) {
273                         if (VIF_EXISTS(tmp))
274                                 break;
275                 }
276                 maxvif = tmp+1;
277         }
278
279         write_unlock_bh(&mrt_lock);
280
281         dev_set_allmulti(dev, -1);
282
283         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
284                 in_dev->cnf.mc_forwarding--;
285                 ip_rt_multicast_event(in_dev);
286         }
287
288         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
289                 unregister_netdevice(dev);
290
291         dev_put(dev);
292         return 0;
293 }
294
295 /* Destroy an unresolved cache entry, killing queued skbs
296    and reporting error to netlink readers.
297  */
298
299 static void ipmr_destroy_unres(struct mfc_cache *c)
300 {
301         struct sk_buff *skb;
302         struct nlmsgerr *e;
303
304         atomic_dec(&cache_resolve_queue_len);
305
306         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
307                 if (skb->nh.iph->version == 0) {
308                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
309                         nlh->nlmsg_type = NLMSG_ERROR;
310                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
311                         skb_trim(skb, nlh->nlmsg_len);
312                         e = NLMSG_DATA(nlh);
313                         e->error = -ETIMEDOUT;
314                         memset(&e->msg, 0, sizeof(e->msg));
315                         netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
316                 } else
317                         kfree_skb(skb);
318         }
319
320         kmem_cache_free(mrt_cachep, c);
321 }
322
323
324 /* A single timer process handles the whole unresolved queue. */
325
326 static void ipmr_expire_process(unsigned long dummy)
327 {
328         unsigned long now;
329         unsigned long expires;
330         struct mfc_cache *c, **cp;
331
332         if (!spin_trylock(&mfc_unres_lock)) {
333                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
334                 return;
335         }
336
337         if (atomic_read(&cache_resolve_queue_len) == 0)
338                 goto out;
339
340         now = jiffies;
341         expires = 10*HZ;
342         cp = &mfc_unres_queue;
343
344         while ((c=*cp) != NULL) {
345                 if (time_after(c->mfc_un.unres.expires, now)) {
346                         unsigned long interval = c->mfc_un.unres.expires - now;
347                         if (interval < expires)
348                                 expires = interval;
349                         cp = &c->next;
350                         continue;
351                 }
352
353                 *cp = c->next;
354
355                 ipmr_destroy_unres(c);
356         }
357
358         if (atomic_read(&cache_resolve_queue_len))
359                 mod_timer(&ipmr_expire_timer, jiffies + expires);
360
361 out:
362         spin_unlock(&mfc_unres_lock);
363 }
364
365 /* Fill the oifs list. Called with mrt_lock held for writing. */
366
367 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
368 {
369         int vifi;
370
371         cache->mfc_un.res.minvif = MAXVIFS;
372         cache->mfc_un.res.maxvif = 0;
373         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
374
375         for (vifi=0; vifi<maxvif; vifi++) {
376                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
377                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
378                         if (cache->mfc_un.res.minvif > vifi)
379                                 cache->mfc_un.res.minvif = vifi;
380                         if (cache->mfc_un.res.maxvif <= vifi)
381                                 cache->mfc_un.res.maxvif = vifi + 1;
382                 }
383         }
384 }
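
/* TTL threshold semantics, for illustration: a ttls[vifi] of 0 or 255
   means "never forward on this vif"; a value t < 255 means "forward only
   packets whose TTL is greater than t" (enforced in ip_mr_forward below).
   E.g., assuming vifs 0-2 exist and ttls = {1, 255, 64}, the entry ends
   up with minvif = 0, maxvif = 3, and vif 1 excluded (its ttl stays 255). */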
385
386 static int vif_add(struct vifctl *vifc, int mrtsock)
387 {
388         int vifi = vifc->vifc_vifi;
389         struct vif_device *v = &vif_table[vifi];
390         struct net_device *dev;
391         struct in_device *in_dev;
392
393         /* Is vif busy ? */
394         if (VIF_EXISTS(vifi))
395                 return -EADDRINUSE;
396
397         switch (vifc->vifc_flags) {
398 #ifdef CONFIG_IP_PIMSM
399         case VIFF_REGISTER:
400                 /*
401                  * Special Purpose VIF in PIM
402                  * All the packets will be sent to the daemon
403                  */
404                 if (reg_vif_num >= 0)
405                         return -EADDRINUSE;
406                 dev = ipmr_reg_vif();
407                 if (!dev)
408                         return -ENOBUFS;
409                 break;
410 #endif
411         case VIFF_TUNNEL:       
412                 dev = ipmr_new_tunnel(vifc);
413                 if (!dev)
414                         return -ENOBUFS;
415                 break;
416         case 0:
417                 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
418                 if (!dev)
419                         return -EADDRNOTAVAIL;
420                 __dev_put(dev);
421                 break;
422         default:
423                 return -EINVAL;
424         }
425
426         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
427                 return -EADDRNOTAVAIL;
428         in_dev->cnf.mc_forwarding++;
429         dev_set_allmulti(dev, +1);
430         ip_rt_multicast_event(in_dev);
431
432         /*
433          *      Fill in the VIF structures
434          */
435         v->rate_limit=vifc->vifc_rate_limit;
436         v->local=vifc->vifc_lcl_addr.s_addr;
437         v->remote=vifc->vifc_rmt_addr.s_addr;
438         v->flags=vifc->vifc_flags;
439         if (!mrtsock)
440                 v->flags |= VIFF_STATIC;
441         v->threshold=vifc->vifc_threshold;
442         v->bytes_in = 0;
443         v->bytes_out = 0;
444         v->pkt_in = 0;
445         v->pkt_out = 0;
446         v->link = dev->ifindex;
447         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
448                 v->link = dev->iflink;
449
450         /* And finish update writing critical data */
451         write_lock_bh(&mrt_lock);
452         dev_hold(dev);
453         v->dev=dev;
454 #ifdef CONFIG_IP_PIMSM
455         if (v->flags&VIFF_REGISTER)
456                 reg_vif_num = vifi;
457 #endif
458         if (vifi+1 > maxvif)
459                 maxvif = vifi+1;
460         write_unlock_bh(&mrt_lock);
461         return 0;
462 }
463
464 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
465 {
466         int line=MFC_HASH(mcastgrp,origin);
467         struct mfc_cache *c;
468
469         for (c=mfc_cache_array[line]; c; c = c->next) {
470                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
471                         break;
472         }
473         return c;
474 }
475
476 /*
477  *      Allocate a multicast cache entry
478  */
479 static struct mfc_cache *ipmr_cache_alloc(void)
480 {
481         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
482         if(c==NULL)
483                 return NULL;
484         memset(c, 0, sizeof(*c));
485         c->mfc_un.res.minvif = MAXVIFS;
486         return c;
487 }
488
489 static struct mfc_cache *ipmr_cache_alloc_unres(void)
490 {
491         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
492         if(c==NULL)
493                 return NULL;
494         memset(c, 0, sizeof(*c));
495         skb_queue_head_init(&c->mfc_un.unres.unresolved);
496         c->mfc_un.unres.expires = jiffies + 10*HZ;
497         return c;
498 }
499
500 /*
501  *      A cache entry has gone into a resolved state from queued
502  */
503  
504 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
505 {
506         struct sk_buff *skb;
507         struct nlmsgerr *e;
508
509         /*
510          *      Play the pending entries through our router
511          */
512
513         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
514                 if (skb->nh.iph->version == 0) {
515                         int err;
516                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
517
518                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
519                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
520                         } else {
521                                 nlh->nlmsg_type = NLMSG_ERROR;
522                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
523                                 skb_trim(skb, nlh->nlmsg_len);
524                                 e = NLMSG_DATA(nlh);
525                                 e->error = -EMSGSIZE;
526                                 memset(&e->msg, 0, sizeof(e->msg));
527                         }
528                         err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
529                 } else
530                         ip_mr_forward(skb, c, 0);
531         }
532 }
533
534 /*
535  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
536  *      expects the following bizarre scheme.
537  *
538  *      Called under mrt_lock.
539  */
540  
541 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
542 {
543         struct sk_buff *skb;
544         int ihl = pkt->nh.iph->ihl<<2;
545         struct igmphdr *igmp;
546         struct igmpmsg *msg;
547         int ret;
548
549 #ifdef CONFIG_IP_PIMSM
550         if (assert == IGMPMSG_WHOLEPKT)
551                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
552         else
553 #endif
554                 skb = alloc_skb(128, GFP_ATOMIC);
555
556         if(!skb)
557                 return -ENOBUFS;
558
559 #ifdef CONFIG_IP_PIMSM
560         if (assert == IGMPMSG_WHOLEPKT) {
561                 /* Ugly, but we have no choice with this interface.
562                    Duplicate old header, fix ihl, length etc.
563                    And all this only to mangle msg->im_msgtype and
564                    to set msg->im_mbz to "mbz" :-)
565                  */
566                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
567                 skb->nh.raw = skb->h.raw = (u8*)msg;
568                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
569                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
570                 msg->im_mbz = 0;
571                 msg->im_vif = reg_vif_num;
572                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
573                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
574         } else 
575 #endif
576         {       
577                 
578         /*
579          *      Copy the IP header
580          */
581
582         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
583         memcpy(skb->data,pkt->data,ihl);
584         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
585         msg = (struct igmpmsg*)skb->nh.iph;
586         msg->im_vif = vifi;
587         skb->dst = dst_clone(pkt->dst);
588
589         /*
590          *      Add our header
591          */
592
593         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
594         igmp->type      =
595         msg->im_msgtype = assert;
596         igmp->code      =       0;
597         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
598         skb->h.raw = skb->nh.raw;
599         }
600
601         if (mroute_socket == NULL) {
602                 kfree_skb(skb);
603                 return -EINVAL;
604         }
605
606         /*
607          *      Deliver to mrouted
608          */
609         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
610                 if (net_ratelimit())
611                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
612                 kfree_skb(skb);
613         }
614
615         return ret;
616 }
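
/* For illustration, a minimal userspace sketch of the receiving end of
   this upcall (hypothetical code, not taken from mrouted): the daemon
   reads its raw IGMP socket and recognises kernel messages by the zeroed
   protocol field, which doubles as im_mbz in struct igmpmsg. */
#if 0   /* illustrative sketch, not built */
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/mroute.h>

static void example_upcall_loop(int igmp_sock)
{
        char buf[2048];

        for (;;) {
                ssize_t n = recv(igmp_sock, buf, sizeof(buf), 0);
                struct igmpmsg *msg = (struct igmpmsg *)buf;

                if (n < (ssize_t)sizeof(*msg))
                        continue;
                if (msg->im_mbz != 0)
                        continue;       /* ordinary IGMP, not an upcall */
                switch (msg->im_msgtype) {
                case IGMPMSG_NOCACHE:   /* resolve and install an MFC entry */
                case IGMPMSG_WRONGVIF:  /* input for PIM assert processing */
                case IGMPMSG_WHOLEPKT:  /* PIMv2 register encapsulation */
                        break;
                }
        }
}
#endif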
617
618 /*
619  *      Queue a packet for resolution. It gets a locked cache entry!
620  */
621  
622 static int
623 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
624 {
625         int err;
626         struct mfc_cache *c;
627
628         spin_lock_bh(&mfc_unres_lock);
629         for (c=mfc_unres_queue; c; c=c->next) {
630                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
631                     c->mfc_origin == skb->nh.iph->saddr)
632                         break;
633         }
634
635         if (c == NULL) {
636                 /*
637                  *      Create a new entry if allowable
638                  */
639
640                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
641                     (c=ipmr_cache_alloc_unres())==NULL) {
642                         spin_unlock_bh(&mfc_unres_lock);
643
644                         kfree_skb(skb);
645                         return -ENOBUFS;
646                 }
647
648                 /*
649                  *      Fill in the new cache entry
650                  */
651                 c->mfc_parent=-1;
652                 c->mfc_origin=skb->nh.iph->saddr;
653                 c->mfc_mcastgrp=skb->nh.iph->daddr;
654
655                 /*
656                  *      Reflect first query at mrouted.
657                  */
658                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
659                         /* If the report failed throw the cache entry 
660                            out - Brad Parker
661                          */
662                         spin_unlock_bh(&mfc_unres_lock);
663
664                         kmem_cache_free(mrt_cachep, c);
665                         kfree_skb(skb);
666                         return err;
667                 }
668
669                 atomic_inc(&cache_resolve_queue_len);
670                 c->next = mfc_unres_queue;
671                 mfc_unres_queue = c;
672
673                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
674         }
675
676         /*
677          *      See if we can append the packet
678          */
679         if (c->mfc_un.unres.unresolved.qlen>3) {
680                 kfree_skb(skb);
681                 err = -ENOBUFS;
682         } else {
683                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
684                 err = 0;
685         }
686
687         spin_unlock_bh(&mfc_unres_lock);
688         return err;
689 }
690
691 /*
692  *      MFC cache manipulation by user space mroute daemon
693  */
694
695 static int ipmr_mfc_delete(struct mfcctl *mfc)
696 {
697         int line;
698         struct mfc_cache *c, **cp;
699
700         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
701
702         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
703                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
704                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
705                         write_lock_bh(&mrt_lock);
706                         *cp = c->next;
707                         write_unlock_bh(&mrt_lock);
708
709                         kmem_cache_free(mrt_cachep, c);
710                         return 0;
711                 }
712         }
713         return -ENOENT;
714 }
715
716 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
717 {
718         int line;
719         struct mfc_cache *uc, *c, **cp;
720
721         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
722
723         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
724                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
725                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
726                         break;
727         }
728
729         if (c != NULL) {
730                 write_lock_bh(&mrt_lock);
731                 c->mfc_parent = mfc->mfcc_parent;
732                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
733                 if (!mrtsock)
734                         c->mfc_flags |= MFC_STATIC;
735                 write_unlock_bh(&mrt_lock);
736                 return 0;
737         }
738
739         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
740                 return -EINVAL;
741
742         c=ipmr_cache_alloc();
743         if (c==NULL)
744                 return -ENOMEM;
745
746         c->mfc_origin=mfc->mfcc_origin.s_addr;
747         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
748         c->mfc_parent=mfc->mfcc_parent;
749         ipmr_update_thresholds(c, mfc->mfcc_ttls);
750         if (!mrtsock)
751                 c->mfc_flags |= MFC_STATIC;
752
753         write_lock_bh(&mrt_lock);
754         c->next = mfc_cache_array[line];
755         mfc_cache_array[line] = c;
756         write_unlock_bh(&mrt_lock);
757
758         /*
759          *      Check to see if we resolved a queued (unresolved) entry.
760          *      If so we need to send the queued frames on and tidy up.
761          */
762         spin_lock_bh(&mfc_unres_lock);
763         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
764              cp = &uc->next) {
765                 if (uc->mfc_origin == c->mfc_origin &&
766                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
767                         *cp = uc->next;
768                         if (atomic_dec_and_test(&cache_resolve_queue_len))
769                                 del_timer(&ipmr_expire_timer);
770                         break;
771                 }
772         }
773         spin_unlock_bh(&mfc_unres_lock);
774
775         if (uc) {
776                 ipmr_cache_resolve(uc, c);
777                 kmem_cache_free(mrt_cachep, uc);
778         }
779         return 0;
780 }
781
782 /*
783  *      Close the multicast socket, and clear the vif tables etc
784  */
785  
786 static void mroute_clean_tables(struct sock *sk)
787 {
788         int i;
789                 
790         /*
791          *      Shut down all active vif entries
792          */
793         for(i=0; i<maxvif; i++) {
794                 if (!(vif_table[i].flags&VIFF_STATIC))
795                         vif_delete(i);
796         }
797
798         /*
799          *      Wipe the cache
800          */
801         for (i=0;i<MFC_LINES;i++) {
802                 struct mfc_cache *c, **cp;
803
804                 cp = &mfc_cache_array[i];
805                 while ((c = *cp) != NULL) {
806                         if (c->mfc_flags&MFC_STATIC) {
807                                 cp = &c->next;
808                                 continue;
809                         }
810                         write_lock_bh(&mrt_lock);
811                         *cp = c->next;
812                         write_unlock_bh(&mrt_lock);
813
814                         kmem_cache_free(mrt_cachep, c);
815                 }
816         }
817
818         if (atomic_read(&cache_resolve_queue_len) != 0) {
819                 struct mfc_cache *c;
820
821                 spin_lock_bh(&mfc_unres_lock);
822                 while (mfc_unres_queue != NULL) {
823                         c = mfc_unres_queue;
824                         mfc_unres_queue = c->next;
825                         spin_unlock_bh(&mfc_unres_lock);
826
827                         ipmr_destroy_unres(c);
828
829                         spin_lock_bh(&mfc_unres_lock);
830                 }
831                 spin_unlock_bh(&mfc_unres_lock);
832         }
833 }
834
835 static void mrtsock_destruct(struct sock *sk)
836 {
837         rtnl_lock();
838         if (sk == mroute_socket) {
839                 ipv4_devconf.mc_forwarding--;
840
841                 write_lock_bh(&mrt_lock);
842                 mroute_socket=NULL;
843                 write_unlock_bh(&mrt_lock);
844
845                 mroute_clean_tables(sk);
846         }
847         rtnl_unlock();
848 }
849
850 /*
851  *      Socket options and virtual interface manipulation. The whole
852  *      virtual interface system is a complete heap, but unfortunately
853  *      that's how BSD mrouted happens to think. Maybe one day with a proper
854  *      MOSPF/PIM router set up we can clean this up.
855  */
856  
857 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
858 {
859         int ret;
860         struct vifctl vif;
861         struct mfcctl mfc;
862         
863         if(optname!=MRT_INIT)
864         {
865                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
866                         return -EACCES;
867         }
868
869         switch(optname)
870         {
871                 case MRT_INIT:
872                         if (sk->sk_type != SOCK_RAW ||
873                             inet_sk(sk)->num != IPPROTO_IGMP)
874                                 return -EOPNOTSUPP;
875                         if(optlen!=sizeof(int))
876                                 return -ENOPROTOOPT;
877
878                         rtnl_lock();
879                         if (mroute_socket) {
880                                 rtnl_unlock();
881                                 return -EADDRINUSE;
882                         }
883
884                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
885                         if (ret == 0) {
886                                 write_lock_bh(&mrt_lock);
887                                 mroute_socket=sk;
888                                 write_unlock_bh(&mrt_lock);
889
890                                 ipv4_devconf.mc_forwarding++;
891                         }
892                         rtnl_unlock();
893                         return ret;
894                 case MRT_DONE:
895                         if (sk!=mroute_socket)
896                                 return -EACCES;
897                         return ip_ra_control(sk, 0, NULL);
898                 case MRT_ADD_VIF:
899                 case MRT_DEL_VIF:
900                         if(optlen!=sizeof(vif))
901                                 return -EINVAL;
902                         if (copy_from_user(&vif,optval,sizeof(vif)))
903                                 return -EFAULT; 
904                         if(vif.vifc_vifi >= MAXVIFS)
905                                 return -ENFILE;
906                         rtnl_lock();
907                         if (optname==MRT_ADD_VIF) {
908                                 ret = vif_add(&vif, sk==mroute_socket);
909                         } else {
910                                 ret = vif_delete(vif.vifc_vifi);
911                         }
912                         rtnl_unlock();
913                         return ret;
914
915                 /*
916                  *      Manipulate the forwarding caches. These live
917                  *      in a sort of kernel/user symbiosis.
918                  */
919                 case MRT_ADD_MFC:
920                 case MRT_DEL_MFC:
921                         if(optlen!=sizeof(mfc))
922                                 return -EINVAL;
923                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
924                                 return -EFAULT;
925                         rtnl_lock();
926                         if (optname==MRT_DEL_MFC)
927                                 ret = ipmr_mfc_delete(&mfc);
928                         else
929                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
930                         rtnl_unlock();
931                         return ret;
932                 /*
933                  *      Control PIM assert.
934                  */
935                 case MRT_ASSERT:
936                 {
937                         int v;
938                         if(get_user(v,(int __user *)optval))
939                                 return -EFAULT;
940                         mroute_do_assert=(v)?1:0;
941                         return 0;
942                 }
943 #ifdef CONFIG_IP_PIMSM
944                 case MRT_PIM:
945                 {
946                         int v, ret;
947                         if(get_user(v,(int __user *)optval))
948                                 return -EFAULT;
949                         v = (v)?1:0;
950                         rtnl_lock();
951                         ret = 0;
952                         if (v != mroute_do_pim) {
953                                 mroute_do_pim = v;
954                                 mroute_do_assert = v;
955 #ifdef CONFIG_IP_PIMSM_V2
956                                 if (mroute_do_pim)
957                                         ret = inet_add_protocol(&pim_protocol,
958                                                                 IPPROTO_PIM);
959                                 else
960                                         ret = inet_del_protocol(&pim_protocol,
961                                                                 IPPROTO_PIM);
962                                 if (ret < 0)
963                                         ret = -EAGAIN;
964 #endif
965                         }
966                         rtnl_unlock();
967                         return ret;
968                 }
969 #endif
970                 /*
971                  *      Spurious command, or MRT_VERSION which you cannot
972                  *      set.
973                  */
974                 default:
975                         return -ENOPROTOOPT;
976         }
977 }
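
/* For illustration, a hypothetical userspace sketch of driving the
   interface above: open the raw IGMP socket, claim the mroute role with
   MRT_INIT, and add one vif. The function name and the 192.0.2.1 address
   are invented for the example. */
#if 0   /* illustrative sketch, not built */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>
#include <string.h>

static int example_become_mrouter(void)
{
        int one = 1;
        struct vifctl vc;
        int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);

        if (s < 0)
                return -1;
        /* only one process may hold the mroute socket (see MRT_INIT) */
        if (setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
                return -1;

        memset(&vc, 0, sizeof(vc));
        vc.vifc_vifi = 0;                       /* first vif slot */
        vc.vifc_threshold = 1;                  /* minimum TTL to forward */
        vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
        if (setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc)) < 0)
                return -1;
        return s;
}
#endif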
978
979 /*
980  *      Getsock opt support for the multicast routing system.
981  */
982  
983 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
984 {
985         int olr;
986         int val;
987
988         if(optname!=MRT_VERSION && 
989 #ifdef CONFIG_IP_PIMSM
990            optname!=MRT_PIM &&
991 #endif
992            optname!=MRT_ASSERT)
993                 return -ENOPROTOOPT;
994
995         if (get_user(olr, optlen))
996                 return -EFAULT;
997
998         olr = min_t(unsigned int, olr, sizeof(int));
999         if (olr < 0)
1000                 return -EINVAL;
1001                 
1002         if(put_user(olr,optlen))
1003                 return -EFAULT;
1004         if(optname==MRT_VERSION)
1005                 val=0x0305;
1006 #ifdef CONFIG_IP_PIMSM
1007         else if(optname==MRT_PIM)
1008                 val=mroute_do_pim;
1009 #endif
1010         else
1011                 val=mroute_do_assert;
1012         if(copy_to_user(optval,&val,olr))
1013                 return -EFAULT;
1014         return 0;
1015 }
1016
1017 /*
1018  *      The IP multicast ioctl support routines.
1019  */
1020  
1021 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1022 {
1023         struct sioc_sg_req sr;
1024         struct sioc_vif_req vr;
1025         struct vif_device *vif;
1026         struct mfc_cache *c;
1027         
1028         switch(cmd)
1029         {
1030                 case SIOCGETVIFCNT:
1031                         if (copy_from_user(&vr,arg,sizeof(vr)))
1032                                 return -EFAULT; 
1033                         if(vr.vifi>=maxvif)
1034                                 return -EINVAL;
1035                         read_lock(&mrt_lock);
1036                         vif=&vif_table[vr.vifi];
1037                         if(VIF_EXISTS(vr.vifi)) {
1038                                 vr.icount=vif->pkt_in;
1039                                 vr.ocount=vif->pkt_out;
1040                                 vr.ibytes=vif->bytes_in;
1041                                 vr.obytes=vif->bytes_out;
1042                                 read_unlock(&mrt_lock);
1043
1044                                 if (copy_to_user(arg,&vr,sizeof(vr)))
1045                                         return -EFAULT;
1046                                 return 0;
1047                         }
1048                         read_unlock(&mrt_lock);
1049                         return -EADDRNOTAVAIL;
1050                 case SIOCGETSGCNT:
1051                         if (copy_from_user(&sr,arg,sizeof(sr)))
1052                                 return -EFAULT;
1053
1054                         read_lock(&mrt_lock);
1055                         c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1056                         if (c) {
1057                                 sr.pktcnt = c->mfc_un.res.pkt;
1058                                 sr.bytecnt = c->mfc_un.res.bytes;
1059                                 sr.wrong_if = c->mfc_un.res.wrong_if;
1060                                 read_unlock(&mrt_lock);
1061
1062                                 if (copy_to_user(arg,&sr,sizeof(sr)))
1063                                         return -EFAULT;
1064                                 return 0;
1065                         }
1066                         read_unlock(&mrt_lock);
1067                         return -EADDRNOTAVAIL;
1068                 default:
1069                         return -ENOIOCTLCMD;
1070         }
1071 }
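
/* For illustration, a hypothetical userspace sketch of the counter
   ioctls above, issued on the same raw IGMP socket: */
#if 0   /* illustrative sketch, not built */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/mroute.h>

static void example_vif_counters(int igmp_sock)
{
        struct sioc_vif_req vr;

        vr.vifi = 0;                    /* query the first vif */
        if (ioctl(igmp_sock, SIOCGETVIFCNT, &vr) == 0)
                printf("vif0: %lu/%lu packets in/out\n",
                       vr.icount, vr.ocount);
}
#endif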
1072
1073
1074 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1075 {
1076         struct vif_device *v;
1077         int ct;
1078         if (event != NETDEV_UNREGISTER)
1079                 return NOTIFY_DONE;
1080         v=&vif_table[0];
1081         for(ct=0;ct<maxvif;ct++,v++) {
1082                 if (v->dev==ptr)
1083                         vif_delete(ct);
1084         }
1085         return NOTIFY_DONE;
1086 }
1087
1088
1089 static struct notifier_block ip_mr_notifier={
1090         .notifier_call = ipmr_device_event,
1091 };
1092
1093 /*
1094  *      Encapsulate a packet by attaching a valid IPIP header to it.
1095  *      This avoids tunnel drivers and other mess and gives us the speed so
1096  *      important for multicast video.
1097  */
1098  
1099 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1100 {
1101         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1102
1103         iph->version    =       4;
1104         iph->tos        =       skb->nh.iph->tos;
1105         iph->ttl        =       skb->nh.iph->ttl;
1106         iph->frag_off   =       0;
1107         iph->daddr      =       daddr;
1108         iph->saddr      =       saddr;
1109         iph->protocol   =       IPPROTO_IPIP;
1110         iph->ihl        =       5;
1111         iph->tot_len    =       htons(skb->len);
1112         ip_select_ident(iph, skb->dst, NULL);
1113         ip_send_check(iph);
1114
1115         skb->h.ipiph = skb->nh.iph;
1116         skb->nh.iph = iph;
1117         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1118         nf_reset(skb);
1119 }
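
/* After ip_encap() the frame looks like this, for illustration:
 *
 *   | outer iphdr (proto = IPPROTO_IPIP) | inner iphdr | payload |
 *   ^ skb->nh.iph                        ^ skb->h.ipiph
 */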
1120
1121 static inline int ipmr_forward_finish(struct sk_buff *skb)
1122 {
1123         struct ip_options * opt = &(IPCB(skb)->opt);
1124
1125         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1126
1127         if (unlikely(opt->optlen))
1128                 ip_forward_options(skb);
1129
1130         return dst_output(skb);
1131 }
1132
1133 /*
1134  *      Processing handlers for ipmr_forward
1135  */
1136
1137 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1138 {
1139         struct iphdr *iph = skb->nh.iph;
1140         struct vif_device *vif = &vif_table[vifi];
1141         struct net_device *dev;
1142         struct rtable *rt;
1143         int    encap = 0;
1144
1145         if (vif->dev == NULL)
1146                 goto out_free;
1147
1148 #ifdef CONFIG_IP_PIMSM
1149         if (vif->flags & VIFF_REGISTER) {
1150                 vif->pkt_out++;
1151                 vif->bytes_out+=skb->len;
1152                 ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1153                 ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1154                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1155                 kfree_skb(skb);
1156                 return;
1157         }
1158 #endif
1159
1160         if (vif->flags&VIFF_TUNNEL) {
1161                 struct flowi fl = { .oif = vif->link,
1162                                     .nl_u = { .ip4_u =
1163                                               { .daddr = vif->remote,
1164                                                 .saddr = vif->local,
1165                                                 .tos = RT_TOS(iph->tos) } },
1166                                     .proto = IPPROTO_IPIP };
1167                 if (ip_route_output_key(&rt, &fl))
1168                         goto out_free;
1169                 encap = sizeof(struct iphdr);
1170         } else {
1171                 struct flowi fl = { .oif = vif->link,
1172                                     .nl_u = { .ip4_u =
1173                                               { .daddr = iph->daddr,
1174                                                 .tos = RT_TOS(iph->tos) } },
1175                                     .proto = IPPROTO_IPIP };
1176                 if (ip_route_output_key(&rt, &fl))
1177                         goto out_free;
1178         }
1179
1180         dev = rt->u.dst.dev;
1181
1182         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1183                 /* Do not fragment multicasts. Alas, IPv4 does not
1184                    allow us to send ICMP here, so such packets will
1185                    disappear into a black hole.
1186                  */
1187
1188                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1189                 ip_rt_put(rt);
1190                 goto out_free;
1191         }
1192
1193         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1194
1195         if (skb_cow(skb, encap)) {
1196                 ip_rt_put(rt);
1197                 goto out_free;
1198         }
1199
1200         vif->pkt_out++;
1201         vif->bytes_out+=skb->len;
1202
1203         dst_release(skb->dst);
1204         skb->dst = &rt->u.dst;
1205         iph = skb->nh.iph;
1206         ip_decrease_ttl(iph);
1207
1208         /* FIXME: forward and output firewalls used to be called here.
1209          * What do we do with netfilter? -- RR */
1210         if (vif->flags & VIFF_TUNNEL) {
1211                 ip_encap(skb, vif->local, vif->remote);
1212                 /* FIXME: extra output firewall step used to be here. --RR */
1213                 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1214                 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1215         }
1216
1217         IPCB(skb)->flags |= IPSKB_FORWARDED;
1218
1219         /*
1220          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
1221          * locally not only before forwarding, but also after forwarding on
1222          * all output interfaces. Clearly, if the mrouter runs a multicast
1223          * program, that program should receive packets regardless of which
1224          * interface it joined on.
1225          * If we did not do this, the program would have to join on all
1226          * interfaces. On the other hand, a multihomed host (or a router,
1227          * but not an mrouter) cannot join on more than one interface - it
1228          * would result in receiving multiple packets.
1229          */
1230         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
1231                 ipmr_forward_finish);
1232         return;
1233
1234 out_free:
1235         kfree_skb(skb);
1236         return;
1237 }
1238
1239 static int ipmr_find_vif(struct net_device *dev)
1240 {
1241         int ct;
1242         for (ct=maxvif-1; ct>=0; ct--) {
1243                 if (vif_table[ct].dev == dev)
1244                         break;
1245         }
1246         return ct;
1247 }
1248
1249 /* "local" means that we should preserve one skb (for local delivery) */
1250
1251 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1252 {
1253         int psend = -1;
1254         int vif, ct;
1255
1256         vif = cache->mfc_parent;
1257         cache->mfc_un.res.pkt++;
1258         cache->mfc_un.res.bytes += skb->len;
1259
1260         /*
1261          * Wrong interface: drop packet and (maybe) send PIM assert.
1262          */
1263         if (vif_table[vif].dev != skb->dev) {
1264                 int true_vifi;
1265
1266                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1267                         /* It is our own packet, looped back.
1268                            Very complicated situation...
1269
1270                            The best workaround until the routing daemons
1271                            are fixed is not to redistribute a packet if it
1272                            was sent through the wrong interface. It means
1273                            that multicast applications WILL NOT work for
1274                            (S,G) entries whose default multicast route
1275                            points to the wrong oif. In any case, it is not
1276                            a good idea to run multicast applications on a router.
1277                          */
1278                         goto dont_forward;
1279                 }
1280
1281                 cache->mfc_un.res.wrong_if++;
1282                 true_vifi = ipmr_find_vif(skb->dev);
1283
1284                 if (true_vifi >= 0 && mroute_do_assert &&
1285                     /* PIM-SM uses asserts when switching from the RPT to
1286                        the SPT, so we cannot check that the packet arrived
1287                        on an oif. It is bad, but otherwise we would need to
1288                        move a pretty large chunk of pimd into the kernel. Ough... --ANK
1289                      */
1290                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1291                     time_after(jiffies, 
1292                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1293                         cache->mfc_un.res.last_assert = jiffies;
1294                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1295                 }
1296                 goto dont_forward;
1297         }
1298
1299         vif_table[vif].pkt_in++;
1300         vif_table[vif].bytes_in+=skb->len;
1301
1302         /*
1303          *      Forward the frame
1304          */
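        /* The skb is cloned for every outgoing vif except the last
           eligible one, which consumes the original (or a clone too,
           when a copy must survive for local delivery). */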
1305         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1306                 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1307                         if (psend != -1) {
1308                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1309                                 if (skb2)
1310                                         ipmr_queue_xmit(skb2, cache, psend);
1311                         }
1312                         psend=ct;
1313                 }
1314         }
1315         if (psend != -1) {
1316                 if (local) {
1317                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1318                         if (skb2)
1319                                 ipmr_queue_xmit(skb2, cache, psend);
1320                 } else {
1321                         ipmr_queue_xmit(skb, cache, psend);
1322                         return 0;
1323                 }
1324         }
1325
1326 dont_forward:
1327         if (!local)
1328                 kfree_skb(skb);
1329         return 0;
1330 }
1331
1332
1333 /*
1334  *      Multicast packets for forwarding arrive here
1335  */
1336
1337 int ip_mr_input(struct sk_buff *skb)
1338 {
1339         struct mfc_cache *cache;
1340         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1341
1342         /* Packet is looped back after forwarding; it should not be
1343            forwarded a second time, but it can still be delivered locally.
1344          */
1345         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1346                 goto dont_forward;
1347
1348         if (!local) {
1349                     if (IPCB(skb)->opt.router_alert) {
1350                             if (ip_call_ra_chain(skb))
1351                                     return 0;
1352                     } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1353                             /* IGMPv1 (and broken IGMPv2 implementations such as
1354                                Cisco IOS <= 11.2(8)) do not put the router alert
1355                                option into IGMP packets destined to routable
1356                                groups. This is very bad, because it means
1357                                that we can forward NO IGMP messages.
1358                              */
1359                             read_lock(&mrt_lock);
1360                             if (mroute_socket) {
1361                                     nf_reset(skb);
1362                                     raw_rcv(mroute_socket, skb);
1363                                     read_unlock(&mrt_lock);
1364                                     return 0;
1365                             }
1366                             read_unlock(&mrt_lock);
1367                     }
1368         }
1369
1370         read_lock(&mrt_lock);
1371         cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1372
1373         /*
1374          *      No usable cache entry
1375          */
1376         if (cache==NULL) {
1377                 int vif;
1378
1379                 if (local) {
1380                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1381                         ip_local_deliver(skb);
1382                         if (skb2 == NULL) {
1383                                 read_unlock(&mrt_lock);
1384                                 return -ENOBUFS;
1385                         }
1386                         skb = skb2;
1387                 }
1388
1389                 vif = ipmr_find_vif(skb->dev);
1390                 if (vif >= 0) {
1391                         int err = ipmr_cache_unresolved(vif, skb);
1392                         read_unlock(&mrt_lock);
1393
1394                         return err;
1395                 }
1396                 read_unlock(&mrt_lock);
1397                 kfree_skb(skb);
1398                 return -ENODEV;
1399         }
1400
1401         ip_mr_forward(skb, cache, local);
1402
1403         read_unlock(&mrt_lock);
1404
1405         if (local)
1406                 return ip_local_deliver(skb);
1407
1408         return 0;
1409
1410 dont_forward:
1411         if (local)
1412                 return ip_local_deliver(skb);
1413         kfree_skb(skb);
1414         return 0;
1415 }
1416
1417 #ifdef CONFIG_IP_PIMSM_V1
1418 /*
1419  * Handle IGMP messages of PIMv1
1420  */
1421
1422 int pim_rcv_v1(struct sk_buff * skb)
1423 {
1424         struct igmphdr *pim;
1425         struct iphdr   *encap;
1426         struct net_device  *reg_dev = NULL;
1427
1428         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1429                 goto drop;
1430
1431         pim = (struct igmphdr*)skb->h.raw;
1432
1433         if (!mroute_do_pim ||
1434             skb->len < sizeof(*pim) + sizeof(*encap) ||
1435             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
1436                 goto drop;
1437
1438         encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1439         /*
1440            Check that:
1441            a. packet is really destined to a multicast group
1442            b. packet is not a NULL-REGISTER
1443            c. packet is not truncated
1444          */
1445         if (!MULTICAST(encap->daddr) ||
1446             encap->tot_len == 0 ||
1447             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1448                 goto drop;
1449
1450         read_lock(&mrt_lock);
1451         if (reg_vif_num >= 0)
1452                 reg_dev = vif_table[reg_vif_num].dev;
1453         if (reg_dev)
1454                 dev_hold(reg_dev);
1455         read_unlock(&mrt_lock);
1456
1457         if (reg_dev == NULL) 
1458                 goto drop;
1459
1460         skb->mac.raw = skb->nh.raw;
1461         skb_pull(skb, (u8*)encap - skb->data);
1462         skb->nh.iph = (struct iphdr *)skb->data;
1463         skb->dev = reg_dev;
1464         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1465         skb->protocol = htons(ETH_P_IP);
1466         skb->ip_summed = 0;
1467         skb->pkt_type = PACKET_HOST;
1468         dst_release(skb->dst);
1469         skb->dst = NULL;
1470         ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1471         ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1472         nf_reset(skb);
1473         netif_rx(skb);
1474         dev_put(reg_dev);
1475         return 0;
1476  drop:
1477         kfree_skb(skb);
1478         return 0;
1479 }
1480 #endif
1481
1482 #ifdef CONFIG_IP_PIMSM_V2
1483 static int pim_rcv(struct sk_buff * skb)
1484 {
1485         struct pimreghdr *pim;
1486         struct iphdr   *encap;
1487         struct net_device  *reg_dev = NULL;
1488
1489         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1490                 goto drop;
1491
1492         pim = (struct pimreghdr*)skb->h.raw;
1493         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1494             (pim->flags&PIM_NULL_REGISTER) ||
1495             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
1496              (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 
1497                 goto drop;
1498
1499         /* check if the inner packet is destined to mcast group */
1500         encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1501         if (!MULTICAST(encap->daddr) ||
1502             encap->tot_len == 0 ||
1503             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1504                 goto drop;
1505
1506         read_lock(&mrt_lock);
1507         if (reg_vif_num >= 0)
1508                 reg_dev = vif_table[reg_vif_num].dev;
1509         if (reg_dev)
1510                 dev_hold(reg_dev);
1511         read_unlock(&mrt_lock);
1512
1513         if (reg_dev == NULL) 
1514                 goto drop;
1515
1516         skb->mac.raw = skb->nh.raw;
1517         skb_pull(skb, (u8*)encap - skb->data);
1518         skb->nh.iph = (struct iphdr *)skb->data;
1519         skb->dev = reg_dev;
1520         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1521         skb->protocol = htons(ETH_P_IP);
1522         skb->ip_summed = 0;
1523         skb->pkt_type = PACKET_HOST;
1524         dst_release(skb->dst);
1525         ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1526         ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1527         skb->dst = NULL;
1528         nf_reset(skb);
1529         netif_rx(skb);
1530         dev_put(reg_dev);
1531         return 0;
1532  drop:
1533         kfree_skb(skb);
1534         return 0;
1535 }
1536 #endif
1537
1538 static int
1539 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1540 {
1541         int ct;
1542         struct rtnexthop *nhp;
1543         struct net_device *dev = vif_table[c->mfc_parent].dev;
1544         u8 *b = skb->tail;
1545         struct rtattr *mp_head;
1546
1547         if (dev)
1548                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1549
1550         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1551
1552         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1553                 if (c->mfc_un.res.ttls[ct] < 255) {
1554                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1555                                 goto rtattr_failure;
1556                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1557                         nhp->rtnh_flags = 0;
1558                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1559                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1560                         nhp->rtnh_len = sizeof(*nhp);
1561                 }
1562         }
1563         mp_head->rta_type = RTA_MULTIPATH;
1564         mp_head->rta_len = skb->tail - (u8*)mp_head;
1565         rtm->rtm_type = RTN_MULTICAST;
1566         return 1;
1567
1568 rtattr_failure:
1569         skb_trim(skb, b - skb->data);
1570         return -EMSGSIZE;
1571 }
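
/*
 * A minimal sketch of how an rtnetlink client might walk the
 * RTA_MULTIPATH attribute that ipmr_fill_mroute() builds: one
 * struct rtnexthop per output vif, with the TTL threshold carried in
 * rtnh_hops. dump_oifs() is a hypothetical helper for illustration.
 */
#if 0   /* illustrative userspace sketch, not part of the kernel build */
#include <stdio.h>
#include <linux/rtnetlink.h>

static void dump_oifs(const struct rtattr *mp)
{
        const struct rtnexthop *nh = RTA_DATA(mp);
        int len = RTA_PAYLOAD(mp);

        while (RTNH_OK(nh, len)) {
                printf("oif %d ttl-threshold %u\n",
                       nh->rtnh_ifindex, nh->rtnh_hops);
                len -= RTNH_ALIGN(nh->rtnh_len);
                nh = RTNH_NEXT(nh);
        }
}
#endif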
1572
1573 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1574 {
1575         int err;
1576         struct mfc_cache *cache;
1577         struct rtable *rt = (struct rtable*)skb->dst;
1578
1579         read_lock(&mrt_lock);
1580         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1581
1582         if (cache == NULL) {
1583                 struct net_device *dev;
1584                 int vif;
1585
1586                 if (nowait) {
1587                         read_unlock(&mrt_lock);
1588                         return -EAGAIN;
1589                 }
1590
1591                 dev = skb->dev;
1592                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1593                         read_unlock(&mrt_lock);
1594                         return -ENODEV;
1595                 }
1596                 skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); /* dummy header */
1597                 skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
1598                 skb->nh.iph->saddr = rt->rt_src;
1599                 skb->nh.iph->daddr = rt->rt_dst;
1600                 skb->nh.iph->version = 0;       /* marks a queued rtnetlink request */
1601                 err = ipmr_cache_unresolved(vif, skb);
1602                 read_unlock(&mrt_lock);
1603                 return err;
1604         }
1605
1606         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1607                 cache->mfc_flags |= MFC_NOTIFY;
1608         err = ipmr_fill_mroute(skb, cache, rtm);
1609         read_unlock(&mrt_lock);
1610         return err;
1611 }
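
/*
 * A sketch of the userspace side: an RTM_GETROUTE query for an (S,G)
 * pair that, after the generic routing lookup, lands in ipmr_get_route()
 * above. addattr() is a hypothetical local helper; error handling and
 * reading the RTM_NEWROUTE reply (which carries the RTA_MULTIPATH seen
 * in ipmr_fill_mroute()) are omitted.
 */
#if 0   /* illustrative userspace sketch, not part of the kernel build */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rtnetlink.h>

static void addattr(struct nlmsghdr *n, int type, const void *data, int len)
{
        struct rtattr *rta = (struct rtattr *)((char *)n + NLMSG_ALIGN(n->nlmsg_len));

        rta->rta_type = type;
        rta->rta_len = RTA_LENGTH(len);
        memcpy(RTA_DATA(rta), data, len);
        n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

static int query_mroute(uint32_t src, uint32_t grp)     /* network order */
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                char            attrs[64];
        } req;
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

        if (fd < 0)
                return -1;
        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family = AF_INET;
        req.rtm.rtm_src_len = 32;
        req.rtm.rtm_dst_len = 32;
        addattr(&req.nlh, RTA_SRC, &src, 4);
        addattr(&req.nlh, RTA_DST, &grp, 4);
        if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
                return -1;
        return fd;      /* caller recv()s the RTM_NEWROUTE answer */
}
#endif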
1612
1613 #ifdef CONFIG_PROC_FS   
1614 /*
1615  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1616  */
1617 struct ipmr_vif_iter {
1618         int ct;                 /* current vif index */
1619 };
1620
1621 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1622                                            loff_t pos)
1623 {
1624         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1625                 if (!VIF_EXISTS(iter->ct))
1626                         continue;
1627                 if (pos-- == 0) 
1628                         return &vif_table[iter->ct];
1629         }
1630         return NULL;
1631 }
1632
1633 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1634 {
1635         read_lock(&mrt_lock);
1636         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 
1637                 : SEQ_START_TOKEN;
1638 }
1639
1640 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1641 {
1642         struct ipmr_vif_iter *iter = seq->private;
1643
1644         ++*pos;
1645         if (v == SEQ_START_TOKEN)
1646                 return ipmr_vif_seq_idx(iter, 0);
1647         
1648         while (++iter->ct < maxvif) {
1649                 if (!VIF_EXISTS(iter->ct))
1650                         continue;
1651                 return &vif_table[iter->ct];
1652         }
1653         return NULL;
1654 }
1655
1656 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1657 {
1658         read_unlock(&mrt_lock);
1659 }
1660
1661 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1662 {
1663         if (v == SEQ_START_TOKEN) {
1664                 seq_puts(seq, 
1665                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1666         } else {
1667                 const struct vif_device *vif = v;
1668                 const char *name =  vif->dev ? vif->dev->name : "none";
1669
1670                 seq_printf(seq,
1671                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1672                            vif - vif_table,
1673                            name, vif->bytes_in, vif->pkt_in, 
1674                            vif->bytes_out, vif->pkt_out,
1675                            vif->flags, vif->local, vif->remote);
1676         }
1677         return 0;
1678 }
1679
1680 static struct seq_operations ipmr_vif_seq_ops = {
1681         .start = ipmr_vif_seq_start,
1682         .next  = ipmr_vif_seq_next,
1683         .stop  = ipmr_vif_seq_stop,
1684         .show  = ipmr_vif_seq_show,
1685 };
1686
1687 static int ipmr_vif_open(struct inode *inode, struct file *file)
1688 {
1689         struct seq_file *seq;
1690         int rc = -ENOMEM;
1691         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1692        
1693         if (!s)
1694                 goto out;
1695
1696         rc = seq_open(file, &ipmr_vif_seq_ops);
1697         if (rc)
1698                 goto out_kfree;
1699
1700         s->ct = 0;
1701         seq = file->private_data;
1702         seq->private = s;
1703 out:
1704         return rc;
1705 out_kfree:
1706         kfree(s);
1707         goto out;
1708
1709 }
1710
1711 static struct file_operations ipmr_vif_fops = {
1712         .owner   = THIS_MODULE,
1713         .open    = ipmr_vif_open,
1714         .read    = seq_read,
1715         .llseek  = seq_lseek,
1716         .release = seq_release_private,
1717 };
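
/*
 * A sketch of consuming /proc/net/ip_mr_vif from userspace; the scan
 * pattern mirrors the format string in ipmr_vif_seq_show() above.
 * dump_vifs() is a hypothetical helper for illustration.
 */
#if 0   /* illustrative userspace sketch, not part of the kernel build */
#include <stdio.h>

static void dump_vifs(void)
{
        char name[32];
        int idx;
        unsigned long bytes_in, pkts_in, bytes_out, pkts_out;
        unsigned int flags, local, remote;
        FILE *f = fopen("/proc/net/ip_mr_vif", "r");

        if (!f)
                return;
        fscanf(f, "%*[^\n]\n");         /* skip the header line */
        while (fscanf(f, "%d %31s %lu %lu %lu %lu %X %X %X\n",
                      &idx, name, &bytes_in, &pkts_in,
                      &bytes_out, &pkts_out, &flags, &local, &remote) == 9)
                printf("vif %d (%s): %lu pkts in, %lu pkts out\n",
                       idx, name, pkts_in, pkts_out);
        fclose(f);
}
#endif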
1718
1719 struct ipmr_mfc_iter {
1720         struct mfc_cache **cache;       /* table being walked; records which lock is held */
1721         int ct;                         /* current hash bucket */
1722 };
1723
1724
1725 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1726 {
1727         struct mfc_cache *mfc;
1728
1729         it->cache = mfc_cache_array;
1730         read_lock(&mrt_lock);
1731         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 
1732                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1733                         if (pos-- == 0) 
1734                                 return mfc;
1735         read_unlock(&mrt_lock);
1736
1737         it->cache = &mfc_unres_queue;
1738         spin_lock_bh(&mfc_unres_lock);
1739         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1740                 if (pos-- == 0)
1741                         return mfc;
1742         spin_unlock_bh(&mfc_unres_lock);
1743
1744         it->cache = NULL;
1745         return NULL;
1746 }
1747
1748
1749 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1750 {
1751         struct ipmr_mfc_iter *it = seq->private;
1752         it->cache = NULL;
1753         it->ct = 0;
1754         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 
1755                 : SEQ_START_TOKEN;
1756 }
1757
1758 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1759 {
1760         struct mfc_cache *mfc = v;
1761         struct ipmr_mfc_iter *it = seq->private;
1762
1763         ++*pos;
1764
1765         if (v == SEQ_START_TOKEN)
1766                 return ipmr_mfc_seq_idx(seq->private, 0);
1767
1768         if (mfc->next)
1769                 return mfc->next;
1770         
1771         if (it->cache == &mfc_unres_queue) 
1772                 goto end_of_list;
1773
1774         BUG_ON(it->cache != mfc_cache_array);
1775
1776         while (++it->ct < MFC_LINES) {
1777                 mfc = mfc_cache_array[it->ct];
1778                 if (mfc)
1779                         return mfc;
1780         }
1781
1782         /* exhausted cache_array: drop mrt_lock, walk the unresolved queue */
1783         read_unlock(&mrt_lock);
1784         it->cache = &mfc_unres_queue;
1785         it->ct = 0;
1786
1787         spin_lock_bh(&mfc_unres_lock);
1788         mfc = mfc_unres_queue;
1789         if (mfc) 
1790                 return mfc;
1791
1792  end_of_list:
1793         spin_unlock_bh(&mfc_unres_lock);
1794         it->cache = NULL;
1795
1796         return NULL;
1797 }
1798
1799 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1800 {
1801         struct ipmr_mfc_iter *it = seq->private;
1802         /* release whichever lock ipmr_mfc_seq_next() left held */
1803         if (it->cache == &mfc_unres_queue)
1804                 spin_unlock_bh(&mfc_unres_lock);
1805         else if (it->cache == mfc_cache_array)
1806                 read_unlock(&mrt_lock);
1807 }
1808
1809 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1810 {
1811         int n;
1812
1813         if (v == SEQ_START_TOKEN) {
1814                 seq_puts(seq, 
1815                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1816         } else {
1817                 const struct mfc_cache *mfc = v;
1818                 const struct ipmr_mfc_iter *it = seq->private;
1819                 
1820                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1821                            (unsigned long) mfc->mfc_mcastgrp,
1822                            (unsigned long) mfc->mfc_origin,
1823                            mfc->mfc_parent,
1824                            mfc->mfc_un.res.pkt,
1825                            mfc->mfc_un.res.bytes,
1826                            mfc->mfc_un.res.wrong_if);
1827
1828                 if (it->cache != &mfc_unres_queue) {
1829                         for (n = mfc->mfc_un.res.minvif;
1830                              n < mfc->mfc_un.res.maxvif; n++) {
1831                                 if (VIF_EXISTS(n) &&
1832                                     mfc->mfc_un.res.ttls[n] < 255)
1833                                         seq_printf(seq,
1834                                                    " %2d:%-3d",
1835                                                    n, mfc->mfc_un.res.ttls[n]);
1836                         }
1837                 }
1838                 seq_putc(seq, '\n');
1839         }
1840         return 0;
1841 }
1842
1843 static struct seq_operations ipmr_mfc_seq_ops = {
1844         .start = ipmr_mfc_seq_start,
1845         .next  = ipmr_mfc_seq_next,
1846         .stop  = ipmr_mfc_seq_stop,
1847         .show  = ipmr_mfc_seq_show,
1848 };
1849
1850 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1851 {
1852         struct seq_file *seq;
1853         int rc = -ENOMEM;
1854         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1855        
1856         if (!s)
1857                 goto out;
1858
1859         rc = seq_open(file, &ipmr_mfc_seq_ops);
1860         if (rc)
1861                 goto out_kfree;
1862
1863         seq = file->private_data;
1864         seq->private = s;
1865 out:
1866         return rc;
1867 out_kfree:
1868         kfree(s);
1869         goto out;
1870
1871 }
1872
1873 static struct file_operations ipmr_mfc_fops = {
1874         .owner   = THIS_MODULE,
1875         .open    = ipmr_mfc_open,
1876         .read    = seq_read,
1877         .llseek  = seq_lseek,
1878         .release = seq_release_private,
1879 };
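
/*
 * Likewise for /proc/net/ip_mr_cache: the fixed columns printed by
 * ipmr_mfc_seq_show() can be scanned as below; the variable-length
 * "vif:ttl" pairs at the end of each resolved line are left to the
 * reader. dump_mfc() is a hypothetical helper for illustration.
 */
#if 0   /* illustrative userspace sketch, not part of the kernel build */
#include <stdio.h>

static void dump_mfc(void)
{
        char line[256];
        unsigned long grp, origin, pkts, bytes, wrong;
        int iif;
        FILE *f = fopen("/proc/net/ip_mr_cache", "r");

        if (!f)
                return;
        fgets(line, sizeof(line), f);   /* skip the header line */
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "%lX %lX %d %lu %lu %lu",
                           &grp, &origin, &iif, &pkts, &bytes, &wrong) == 6)
                        printf("group %08lX origin %08lX iif %d pkts %lu\n",
                               grp, origin, iif, pkts);
        fclose(f);
}
#endif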
1880 #endif  
1881
1882 #ifdef CONFIG_IP_PIMSM_V2
1883 static struct net_protocol pim_protocol = {
1884         .handler        =       pim_rcv,        /* bound to IPPROTO_PIM via MRT_PIM */
1885 };
1886 #endif
1887
1888
1889 /*
1890  *      Setup for IP multicast routing
1891  */
1892  
1893 void __init ip_mr_init(void)
1894 {
1895         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1896                                        sizeof(struct mfc_cache),
1897                                        0, SLAB_HWCACHE_ALIGN,
1898                                        NULL, NULL);
1899         if (!mrt_cachep)
1900                 panic("cannot allocate ip_mrt_cache");
1901
1902         init_timer(&ipmr_expire_timer);
1903         ipmr_expire_timer.function = ipmr_expire_process;
1904         register_netdevice_notifier(&ip_mr_notifier);
1905 #ifdef CONFIG_PROC_FS   
1906         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1907         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1908 #endif  
1909 }
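
/*
 * ip_mr_init() only sets the infrastructure up; forwarding stays off
 * until a daemon such as mrouted claims the mroute socket. A minimal
 * sketch of that handshake, assuming only the MRT_INIT option from
 * <linux/mroute.h>: multicast routing remains enabled for as long as
 * the raw IGMP socket stays open, and is torn down when it closes.
 */
#if 0   /* illustrative userspace sketch, not part of the kernel build */
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>
#include <unistd.h>

static int start_mrouting(void)
{
        int one = 1;
        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);

        if (fd < 0)
                return -1;
        /* becomes mroute_socket; only one such socket may exist */
        if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0) {
                close(fd);
                return -1;
        }
        return fd;      /* keep open; MRT_ADD_VIF / MRT_ADD_MFC follow */
}
#endif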