/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that changes to these are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We have returned to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected
   by the weak lock mrt_lock. The queue of unresolved entries is protected
   by the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */
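/*
 * Locking sketch (illustrative summary of the rules above; these are
 * the exact primitives used throughout this file):
 *
 *      read_lock(&mrt_lock);                   (data path: look up vifs/MFC)
 *      c = ipmr_cache_find(origin, mcastgrp);
 *      read_unlock(&mrt_lock);
 *
 *      write_lock_bh(&mrt_lock);               (process context: mutate tables)
 *      ...
 *      write_unlock_bh(&mrt_lock);
 *
 *      spin_lock_bh(&mfc_unres_lock);          (unresolved queue, either context)
 *      ...
 *      spin_unlock_bh(&mfc_unres_lock);
 */
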
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

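/*
 * Illustrative userspace sketch (assumed daemon-side code, not part of
 * this file): how an mrouted-style daemon would ask for a DVMRP tunnel
 * vif; the VIFF_TUNNEL case below then materializes a "dvmrpN" device.
 * "mrt_sock" is a raw IGMP socket on which MRT_INIT has succeeded.
 *
 *      struct vifctl vc;
 *      memset(&vc, 0, sizeof(vc));
 *      vc.vifc_vifi = 1;
 *      vc.vifc_flags = VIFF_TUNNEL;
 *      vc.vifc_threshold = 1;
 *      vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");     (local endpoint)
 *      vc.vifc_rmt_addr.s_addr = inet_addr("198.51.100.2");  (remote endpoint)
 *      setsockopt(mrt_sock, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */
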
static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name("tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
                                goto failure;
                        in_dev->cnf.rp_filter = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        if ((in_dev = inetdev_init(dev)) == NULL)
                goto failure;

        in_dev->cnf.rp_filter = 0;

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                in_dev->cnf.mc_forwarding--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for the whole queue of unresolved entries. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oif list. Called under write-locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        in_dev->cnf.mc_forwarding++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has moved from the unresolved queue to the resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

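/*
 * What the daemon actually reads off its IGMP socket for the
 * non-WHOLEPKT case built below (sketch): a pseudo IP header whose
 * protocol field is 0, overlaid as struct igmpmsg from <linux/mroute.h>,
 * followed by a fake struct igmphdr carrying the message type:
 *
 *      struct igmpmsg {
 *              __u32 unused1, unused2;
 *              unsigned char im_msgtype;       (e.g. IGMPMSG_NOCACHE)
 *              unsigned char im_mbz;           (must be zero)
 *              unsigned char im_vif;           (arriving vif index)
 *              unsigned char unused3;
 *              struct in_addr im_src, im_dst;
 *      };
 */
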
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. It is attached to a cache entry
 *      created under the unresolved-queue lock.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

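/*
 * Illustrative userspace sketch (assumed daemon-side code): installing
 * an (S,G) forwarding entry via MRT_ADD_MFC.  struct mfcctl comes from
 * <linux/mroute.h>; "mrt_sock" is the initialized mroute socket.
 *
 *      struct mfcctl mc;
 *      memset(&mc, 0, sizeof(mc));
 *      mc.mfcc_origin.s_addr   = inet_addr("192.0.2.10");    (S)
 *      mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.2.3");     (G)
 *      mc.mfcc_parent  = 0;    (expected incoming vif)
 *      mc.mfcc_ttls[1] = 1;    (forward on vif 1 when packet TTL > 1)
 *      setsockopt(mrt_sock, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */
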
static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

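/*
 * Illustrative userspace sketch (assumed daemon-side code): the overall
 * lifecycle a daemon drives through this setsockopt interface.  Error
 * handling elided; constants are from <linux/mroute.h>.
 *
 *      int mrt_sock = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *
 *      setsockopt(mrt_sock, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      ... MRT_ADD_VIF / MRT_ADD_MFC as sketched above ...
 *      ... read IGMPMSG_* upcalls from mrt_sock, program routes ...
 *      setsockopt(mrt_sock, IPPROTO_IP, MRT_DONE, NULL, 0);
 */
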
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        ipv4_devconf.mc_forwarding++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v, ret;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v)?1:0;
                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      getsockopt() support for the multicast routing system.
 */
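/*
 * Illustrative userspace sketch: querying the running configuration.
 * MRT_VERSION returns 0x0305 (see below).
 *
 *      int ver;
 *      socklen_t len = sizeof(ver);
 *      getsockopt(mrt_sock, IPPROTO_IP, MRT_VERSION, &ver, &len);
 */
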
int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

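/*
 * Illustrative userspace sketch: reading per-vif counters with
 * SIOCGETVIFCNT (struct sioc_vif_req, <linux/mroute.h>); SIOCGETSGCNT
 * works the same way with struct sioc_sg_req for (S,G) counters.
 *
 *      struct sioc_vif_req vreq;
 *      memset(&vreq, 0, sizeof(vreq));
 *      vreq.vifi = 0;
 *      if (ioctl(mrt_sock, SIOCGETVIFCNT, &vreq) == 0)
 *              printf("vif0: %lu/%lu pkts in/out\n",
 *                     vreq.icount, vreq.ocount);
 */
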
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi))        {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct vif_device *v;
        int ct;
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==ptr)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

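/*
 * Resulting layout (sketch): the original multicast datagram becomes
 * the payload of a protocol-4 (IPIP) packet addressed to the tunnel
 * endpoints, with TOS/TTL copied from the inner header:
 *
 *      +------------------------+------------------------+---------+
 *      | outer iphdr            | inner iphdr (original) | payload |
 *      | proto = IPPROTO_IPIP   | original saddr/daddr   |         |
 *      | saddr/daddr = tunnel   |                        |         |
 *      +------------------------+------------------------+---------+
 */
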
static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so such packets will
                   simply disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if an mrouter runs a multicasting
         * program, it should receive packets regardless of the interface
         * on which the program joined.
         * If we did not do this, the program would have to join on all
         * interfaces. On the other hand, a multihoming host (or a router,
         * but not an mrouter) cannot join on more than one interface, as
         * that would result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround until the routing daemons are
                           fixed is to not redistribute a packet if it was
                           sent through the wrong interface. This means that
                           multicast applications WILL NOT work for an (S,G)
                           whose default multicast route points to the wrong
                           oif. In any case, it is not a good idea to run
                           multicasting applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       That is bad, but otherwise we would have to move a
                       pretty large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* The packet is looped back after forwarding; it must not be
           forwarded a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                if (IPCB(skb)->opt.router_alert) {
                        if (ip_call_ra_chain(skb))
                                return 0;
                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
                        /* IGMPv1 (and broken IGMPv2 implementations such as
                           Cisco IOS <= 11.2(8)) do not put the router alert
                           option in IGMP packets destined to routable
                           groups. That is very bad, because it means
                           we can forward NO IGMP messages.
                         */
                        read_lock(&mrt_lock);
                        if (mroute_socket) {
                                nf_reset(skb);
                                raw_rcv(mroute_socket, skb);
                                read_unlock(&mrt_lock);
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

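/*
 * Wire layout this handler expects (sketch, per the parsing below):
 * PIMv1 rides inside IGMP, so the transport header is a struct igmphdr
 * whose "group" field carries the PIM version and whose "code" field
 * carries the message type, immediately followed by the encapsulated
 * datagram that is handed to the pimreg device:
 *
 *      struct igmphdr   (group == PIM_V1_VERSION, code == PIM_V1_REGISTER)
 *      struct iphdr     (inner multicast packet)
 */
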
int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct igmphdr));
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
1451         if (!MULTICAST(encap->daddr) ||
1452             encap->tot_len == 0 ||
1453             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1454                 goto drop;
1455
1456         read_lock(&mrt_lock);
1457         if (reg_vif_num >= 0)
1458                 reg_dev = vif_table[reg_vif_num].dev;
1459         if (reg_dev)
1460                 dev_hold(reg_dev);
1461         read_unlock(&mrt_lock);
1462
1463         if (reg_dev == NULL)
1464                 goto drop;
1465
1466         skb->mac_header = skb->network_header;
1467         skb_pull(skb, (u8*)encap - skb->data);
1468         skb_reset_network_header(skb);
1469         skb->dev = reg_dev;
1470         skb->protocol = htons(ETH_P_IP);
1471         skb->ip_summed = 0;
1472         skb->pkt_type = PACKET_HOST;
1473         dst_release(skb->dst);
1474         skb->dst = NULL;
1475         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1476         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1477         nf_reset(skb);
1478         netif_rx(skb);
1479         dev_put(reg_dev);
1480         return 0;
1481  drop:
1482         kfree_skb(skb);
1483         return 0;
1484 }
1485 #endif
1486
1487 #ifdef CONFIG_IP_PIMSM_V2
1488 static int pim_rcv(struct sk_buff * skb)
1489 {
1490         struct pimreghdr *pim;
1491         struct iphdr   *encap;
1492         struct net_device  *reg_dev = NULL;
1493
1494         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1495                 goto drop;
1496
1497         pim = (struct pimreghdr *)skb_transport_header(skb);
1498         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1499             (pim->flags&PIM_NULL_REGISTER) ||
1500             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1501              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1502                 goto drop;
1503
1504         /* check if the inner packet is destined to mcast group */
1505         encap = (struct iphdr *)(skb_transport_header(skb) +
1506                                  sizeof(struct pimreghdr));
1507         if (!MULTICAST(encap->daddr) ||
1508             encap->tot_len == 0 ||
1509             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1510                 goto drop;
1511
1512         read_lock(&mrt_lock);
1513         if (reg_vif_num >= 0)
1514                 reg_dev = vif_table[reg_vif_num].dev;
1515         if (reg_dev)
1516                 dev_hold(reg_dev);
1517         read_unlock(&mrt_lock);
1518
1519         if (reg_dev == NULL)
1520                 goto drop;
1521
1522         skb->mac_header = skb->network_header;
1523         skb_pull(skb, (u8*)encap - skb->data);
1524         skb_reset_network_header(skb);
1525         skb->dev = reg_dev;
1526         skb->protocol = htons(ETH_P_IP);
1527         skb->ip_summed = 0;
1528         skb->pkt_type = PACKET_HOST;
1529         dst_release(skb->dst);
1530         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1531         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1532         skb->dst = NULL;
1533         nf_reset(skb);
1534         netif_rx(skb);
1535         dev_put(reg_dev);
1536         return 0;
1537  drop:
1538         kfree_skb(skb);
1539         return 0;
1540 }
1541 #endif
1542
1543 static int
1544 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1545 {
1546         int ct;
1547         struct rtnexthop *nhp;
1548         struct net_device *dev = vif_table[c->mfc_parent].dev;
1549         u8 *b = skb_tail_pointer(skb);
1550         struct rtattr *mp_head;
1551
1552         if (dev)
1553                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1554
1555         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1556
1557         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1558                 if (c->mfc_un.res.ttls[ct] < 255) {
1559                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1560                                 goto rtattr_failure;
1561                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1562                         nhp->rtnh_flags = 0;
1563                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1564                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1565                         nhp->rtnh_len = sizeof(*nhp);
1566                 }
1567         }
1568         mp_head->rta_type = RTA_MULTIPATH;
1569         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1570         rtm->rtm_type = RTN_MULTICAST;
1571         return 1;
1572
1573 rtattr_failure:
1574         nlmsg_trim(skb, b);
1575         return -EMSGSIZE;
1576 }
1577
1578 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1579 {
1580         int err;
1581         struct mfc_cache *cache;
1582         struct rtable *rt = (struct rtable*)skb->dst;
1583
1584         read_lock(&mrt_lock);
1585         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1586
1587         if (cache==NULL) {
1588                 struct sk_buff *skb2;
1589                 struct iphdr *iph;
1590                 struct net_device *dev;
1591                 int vif;
1592
1593                 if (nowait) {
1594                         read_unlock(&mrt_lock);
1595                         return -EAGAIN;
1596                 }
1597
1598                 dev = skb->dev;
1599                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1600                         read_unlock(&mrt_lock);
1601                         return -ENODEV;
1602                 }
1603                 skb2 = skb_clone(skb, GFP_ATOMIC);
1604                 if (!skb2) {
1605                         read_unlock(&mrt_lock);
1606                         return -ENOMEM;
1607                 }
1608
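                /*
                 * Hand the clone to the resolver with a skeleton IP header:
                 * only the addresses are filled in, which is all the upcall
                 * to mrouted needs; the bogus version field makes plain
                 * that the header is synthetic.
                 */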
1609                 skb_push(skb2, sizeof(struct iphdr));
1610                 skb_reset_network_header(skb2);
1611                 iph = ip_hdr(skb2);
1612                 iph->ihl = sizeof(struct iphdr) >> 2;
1613                 iph->saddr = rt->rt_src;
1614                 iph->daddr = rt->rt_dst;
1615                 iph->version = 0;
1616                 err = ipmr_cache_unresolved(vif, skb2);
1617                 read_unlock(&mrt_lock);
1618                 return err;
1619         }
1620
1621         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1622                 cache->mfc_flags |= MFC_NOTIFY;
1623         err = ipmr_fill_mroute(skb, cache, rtm);
1624         read_unlock(&mrt_lock);
1625         return err;
1626 }
1627
1628 #ifdef CONFIG_PROC_FS
1629 /*
1630  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1631  */
1632 struct ipmr_vif_iter {
1633         int ct;
1634 };
1635
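/*
 *      Advance to the pos'th live entry of vif_table, skipping holes left
 *      by deleted vifs; returns NULL once pos runs past maxvif.  The
 *      caller already holds mrt_lock.
 */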
1636 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1637                                            loff_t pos)
1638 {
1639         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1640                 if (!VIF_EXISTS(iter->ct))
1641                         continue;
1642                 if (pos-- == 0)
1643                         return &vif_table[iter->ct];
1644         }
1645         return NULL;
1646 }
1647
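/*
 *      seq_file walk over the vif table: ->start takes mrt_lock (dropped
 *      again in ->stop) and returns SEQ_START_TOKEN so that ->show can
 *      emit the header line first.
 */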
1648 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1649 {
1650         read_lock(&mrt_lock);
1651         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1652                 : SEQ_START_TOKEN;
1653 }
1654
1655 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1656 {
1657         struct ipmr_vif_iter *iter = seq->private;
1658
1659         ++*pos;
1660         if (v == SEQ_START_TOKEN)
1661                 return ipmr_vif_seq_idx(iter, 0);
1662
1663         while (++iter->ct < maxvif) {
1664                 if (!VIF_EXISTS(iter->ct))
1665                         continue;
1666                 return &vif_table[iter->ct];
1667         }
1668         return NULL;
1669 }
1670
1671 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1672 {
1673         read_unlock(&mrt_lock);
1674 }
1675
1676 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1677 {
1678         if (v == SEQ_START_TOKEN) {
1679                 seq_puts(seq,
1680                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1681         } else {
1682                 const struct vif_device *vif = v;
1683         const char *name = vif->dev ? vif->dev->name : "none";
1684
1685                 seq_printf(seq,
1686                    "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1687                            vif - vif_table,
1688                            name, vif->bytes_in, vif->pkt_in,
1689                            vif->bytes_out, vif->pkt_out,
1690                            vif->flags, vif->local, vif->remote);
1691         }
1692         return 0;
1693 }
1694
1695 static const struct seq_operations ipmr_vif_seq_ops = {
1696         .start = ipmr_vif_seq_start,
1697         .next  = ipmr_vif_seq_next,
1698         .stop  = ipmr_vif_seq_stop,
1699         .show  = ipmr_vif_seq_show,
1700 };
1701
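/*
 *      Allocate per-reader iterator state and hang it off the seq_file;
 *      seq_release_private() frees it again on close.
 */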
1702 static int ipmr_vif_open(struct inode *inode, struct file *file)
1703 {
1704         struct seq_file *seq;
1705         int rc = -ENOMEM;
1706         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1707
1708         if (!s)
1709                 goto out;
1710
1711         rc = seq_open(file, &ipmr_vif_seq_ops);
1712         if (rc)
1713                 goto out_kfree;
1714
1715         s->ct = 0;
1716         seq = file->private_data;
1717         seq->private = s;
1718 out:
1719         return rc;
1720 out_kfree:
1721         kfree(s);
1722         goto out;
1724 }
1725
1726 static const struct file_operations ipmr_vif_fops = {
1727         .owner   = THIS_MODULE,
1728         .open    = ipmr_vif_open,
1729         .read    = seq_read,
1730         .llseek  = seq_lseek,
1731         .release = seq_release_private,
1732 };
1733
1734 struct ipmr_mfc_iter {
1735         struct mfc_cache **cache;
1736         int ct;
1737 };
1738
1739
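/*
 *      Position the iterator at the pos'th cache entry.  Resolved entries
 *      are walked under mrt_lock, the unresolved queue under
 *      mfc_unres_lock; it->cache records which list (and thus which lock)
 *      the iterator is on so that ->stop can unlock correctly.
 */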
1740 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1741 {
1742         struct mfc_cache *mfc;
1743
1744         it->cache = mfc_cache_array;
1745         read_lock(&mrt_lock);
1746         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1747                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1748                         if (pos-- == 0)
1749                                 return mfc;
1750         read_unlock(&mrt_lock);
1751
1752         it->cache = &mfc_unres_queue;
1753         spin_lock_bh(&mfc_unres_lock);
1754         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1755                 if (pos-- == 0)
1756                         return mfc;
1757         spin_unlock_bh(&mfc_unres_lock);
1758
1759         it->cache = NULL;
1760         return NULL;
1761 }
1762
1763
1764 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1765 {
1766         struct ipmr_mfc_iter *it = seq->private;
1767         it->cache = NULL;
1768         it->ct = 0;
1769         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1770                 : SEQ_START_TOKEN;
1771 }
1772
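/*
 *      Step to the next cache entry.  When the resolved hash table is
 *      exhausted, mrt_lock is traded for mfc_unres_lock and iteration
 *      continues on the unresolved queue.
 */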
1773 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1774 {
1775         struct mfc_cache *mfc = v;
1776         struct ipmr_mfc_iter *it = seq->private;
1777
1778         ++*pos;
1779
1780         if (v == SEQ_START_TOKEN)
1781                 return ipmr_mfc_seq_idx(seq->private, 0);
1782
1783         if (mfc->next)
1784                 return mfc->next;
1785
1786         if (it->cache == &mfc_unres_queue)
1787                 goto end_of_list;
1788
1789         BUG_ON(it->cache != mfc_cache_array);
1790
1791         while (++it->ct < MFC_LINES) {
1792                 mfc = mfc_cache_array[it->ct];
1793                 if (mfc)
1794                         return mfc;
1795         }
1796
1797         /* exhausted cache_array, show unresolved */
1798         read_unlock(&mrt_lock);
1799         it->cache = &mfc_unres_queue;
1800         it->ct = 0;
1801
1802         spin_lock_bh(&mfc_unres_lock);
1803         mfc = mfc_unres_queue;
1804         if (mfc)
1805                 return mfc;
1806
1807  end_of_list:
1808         spin_unlock_bh(&mfc_unres_lock);
1809         it->cache = NULL;
1810
1811         return NULL;
1812 }
1813
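/*
 *      Release whichever lock ipmr_mfc_seq_idx()/_next() left held, as
 *      recorded in it->cache.
 */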
1814 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1815 {
1816         struct ipmr_mfc_iter *it = seq->private;
1817
1818         if (it->cache == &mfc_unres_queue)
1819                 spin_unlock_bh(&mfc_unres_lock);
1820         else if (it->cache == mfc_cache_array)
1821                 read_unlock(&mrt_lock);
1822 }
1823
1824 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1825 {
1826         int n;
1827
1828         if (v == SEQ_START_TOKEN) {
1829                 seq_puts(seq,
1830                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1831         } else {
1832                 const struct mfc_cache *mfc = v;
1833                 const struct ipmr_mfc_iter *it = seq->private;
1834
1835                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1836                            (unsigned long) mfc->mfc_mcastgrp,
1837                            (unsigned long) mfc->mfc_origin,
1838                            mfc->mfc_parent,
1839                            mfc->mfc_un.res.pkt,
1840                            mfc->mfc_un.res.bytes,
1841                            mfc->mfc_un.res.wrong_if);
1842
1843                 if (it->cache != &mfc_unres_queue) {
1844                         for (n = mfc->mfc_un.res.minvif;
1845                              n < mfc->mfc_un.res.maxvif; n++) {
1846                                 if (VIF_EXISTS(n) &&
1847                                     mfc->mfc_un.res.ttls[n] < 255)
1848                                         seq_printf(seq,
1849                                                    " %2d:%-3d",
1850                                                    n, mfc->mfc_un.res.ttls[n]);
1851                         }
1852                 }
1853                 seq_putc(seq, '\n');
1854         }
1855         return 0;
1856 }
1857
1858 static const struct seq_operations ipmr_mfc_seq_ops = {
1859         .start = ipmr_mfc_seq_start,
1860         .next  = ipmr_mfc_seq_next,
1861         .stop  = ipmr_mfc_seq_stop,
1862         .show  = ipmr_mfc_seq_show,
1863 };
1864
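/*
 *      As ipmr_vif_open(): allocate the iterator state and attach it as
 *      the seq_file's private data.
 */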
1865 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1866 {
1867         struct seq_file *seq;
1868         int rc = -ENOMEM;
1869         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1870
1871         if (!s)
1872                 goto out;
1873
1874         rc = seq_open(file, &ipmr_mfc_seq_ops);
1875         if (rc)
1876                 goto out_kfree;
1877
1878         seq = file->private_data;
1879         seq->private = s;
1880 out:
1881         return rc;
1882 out_kfree:
1883         kfree(s);
1884         goto out;
1886 }
1887
1888 static const struct file_operations ipmr_mfc_fops = {
1889         .owner   = THIS_MODULE,
1890         .open    = ipmr_mfc_open,
1891         .read    = seq_read,
1892         .llseek  = seq_lseek,
1893         .release = seq_release_private,
1894 };
1895 #endif
1896
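/*
 *      pim_rcv() handles incoming IPPROTO_PIM packets; the handler is
 *      added and removed with inet_add_protocol()/inet_del_protocol()
 *      when user space toggles the MRT_PIM socket option.
 */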
1897 #ifdef CONFIG_IP_PIMSM_V2
1898 static struct net_protocol pim_protocol = {
1899         .handler        =       pim_rcv,
1900 };
1901 #endif
1902
1903
1904 /*
1905  *      Setup for IP multicast routing
1906  */
1907
1908 void __init ip_mr_init(void)
1909 {
1910         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1911                                        sizeof(struct mfc_cache),
1912                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1913                                        NULL, NULL);
1914         init_timer(&ipmr_expire_timer);
1915         ipmr_expire_timer.function = ipmr_expire_process;
1916         register_netdevice_notifier(&ip_mr_notifier);
1917 #ifdef CONFIG_PROC_FS
1918         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1919         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1920 #endif
1921 }