net/ipv4/ipmr.c (linux-2.6)
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 static struct sock *mroute_socket;
71
72
73 /* Big lock, protecting the vif table, mrt cache and mroute socket state.
74    Note that changes are serialized via rtnl_lock.
75  */
76
77 static DEFINE_RWLOCK(mrt_lock);
78
79 /*
80  *      Multicast router control variables
81  */
82
83 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
84 static int maxvif;
85
86 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87
88 static int mroute_do_assert;                            /* Set in PIM assert    */
89 static int mroute_do_pim;
90
91 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
92
93 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
94 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
95
96 /* Special spinlock for queue of unresolved entries */
97 static DEFINE_SPINLOCK(mfc_unres_lock);
98
99 /* We return to Alan's original scheme. The hash table of resolved
100    entries is changed only in process context and protected
101    with the weak lock mrt_lock. The queue of unresolved entries is
102    protected with the strong spinlock mfc_unres_lock.
103
104    This way the data path is entirely free of exclusive locks.
105  */
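/* A sketch (for illustration; inferred from the code below) of the
 * resulting locking discipline:
 *
 *      data path (softirq)             control path (process context)
 *      -------------------             -------------------------------
 *      read_lock(&mrt_lock);           rtnl_lock();
 *      c = ipmr_cache_find(...);       write_lock_bh(&mrt_lock);
 *      ip_mr_forward(skb, c, ...);     ...update vif_table/mfc_cache_array...
 *      read_unlock(&mrt_lock);         write_unlock_bh(&mrt_lock);
 *                                      rtnl_unlock();
 *
 * Only the unresolved queue, which both paths touch, needs mfc_unres_lock.
 */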
106
107 static struct kmem_cache *mrt_cachep __read_mostly;
108
109 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
111 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112
113 #ifdef CONFIG_IP_PIMSM_V2
114 static struct net_protocol pim_protocol;
115 #endif
116
117 static struct timer_list ipmr_expire_timer;
118
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
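/* Note: DVMRP tunnel vifs are created by driving the generic "tunl0"
 * device's SIOCADDTUNNEL/SIOCDELTUNNEL ioctls from kernel context; the
 * get_fs()/set_fs(KERNEL_DS) dance below lets a kernel pointer pass
 * through the __user ioctl interface.
 */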
120
121 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
122 {
123         dev_close(dev);
124
125         dev = __dev_get_by_name(&init_net, "tunl0");
126         if (dev) {
127                 struct ifreq ifr;
128                 mm_segment_t    oldfs;
129                 struct ip_tunnel_parm p;
130
131                 memset(&p, 0, sizeof(p));
132                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
133                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
134                 p.iph.version = 4;
135                 p.iph.ihl = 5;
136                 p.iph.protocol = IPPROTO_IPIP;
137                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139
140                 oldfs = get_fs(); set_fs(KERNEL_DS);
141                 dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL);
142                 set_fs(oldfs);
143         }
144 }
145
146 static
147 struct net_device *ipmr_new_tunnel(struct vifctl *v)
148 {
149         struct net_device  *dev;
150
151         dev = __dev_get_by_name(&init_net, "tunl0");
152
153         if (dev) {
154                 int err;
155                 struct ifreq ifr;
156                 mm_segment_t    oldfs;
157                 struct ip_tunnel_parm p;
158                 struct in_device  *in_dev;
159
160                 memset(&p, 0, sizeof(p));
161                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
162                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
163                 p.iph.version = 4;
164                 p.iph.ihl = 5;
165                 p.iph.protocol = IPPROTO_IPIP;
166                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
167                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
168
169                 oldfs = get_fs(); set_fs(KERNEL_DS);
170                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
171                 set_fs(oldfs);
172
173                 dev = NULL;
174
175                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
176                         dev->flags |= IFF_MULTICAST;
177
178                         in_dev = __in_dev_get_rtnl(dev);
179                         if (in_dev == NULL)
180                                 goto failure;
181
182                         ipv4_devconf_setall(in_dev);
183                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
184
185                         if (dev_open(dev))
186                                 goto failure;
187                         dev_hold(dev);
188                 }
189         }
190         return dev;
191
192 failure:
193         /* allow the register to be completed before unregistering. */
194         rtnl_unlock();
195         rtnl_lock();
196
197         unregister_netdevice(dev);
198         return NULL;
199 }
200
201 #ifdef CONFIG_IP_PIMSM
202
203 static int reg_vif_num = -1;
204
205 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
206 {
207         read_lock(&mrt_lock);
208         dev->stats.tx_bytes += skb->len;
209         dev->stats.tx_packets++;
210         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
211         read_unlock(&mrt_lock);
212         kfree_skb(skb);
213         return 0;
214 }
215
216 static void reg_vif_setup(struct net_device *dev)
217 {
218         dev->type               = ARPHRD_PIMREG;
219         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
220         dev->flags              = IFF_NOARP;
221         dev->hard_start_xmit    = reg_vif_xmit;
222         dev->destructor         = free_netdev;
223 }
224
225 static struct net_device *ipmr_reg_vif(void)
226 {
227         struct net_device *dev;
228         struct in_device *in_dev;
229
230         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
231
232         if (dev == NULL)
233                 return NULL;
234
235         if (register_netdevice(dev)) {
236                 free_netdev(dev);
237                 return NULL;
238         }
239         dev->iflink = 0;
240
241         rcu_read_lock();
242         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
243                 rcu_read_unlock();
244                 goto failure;
245         }
246
247         ipv4_devconf_setall(in_dev);
248         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
249         rcu_read_unlock();
250
251         if (dev_open(dev))
252                 goto failure;
253
254         dev_hold(dev);
255
256         return dev;
257
258 failure:
259         /* allow the register to be completed before unregistering. */
260         rtnl_unlock();
261         rtnl_lock();
262
263         unregister_netdevice(dev);
264         return NULL;
265 }
266 #endif
267
268 /*
269  *      Delete a VIF entry
270  *      @notify: Set to 1 if the caller is a notifier_call
271  */
272
273 static int vif_delete(int vifi, int notify)
274 {
275         struct vif_device *v;
276         struct net_device *dev;
277         struct in_device *in_dev;
278
279         if (vifi < 0 || vifi >= maxvif)
280                 return -EADDRNOTAVAIL;
281
282         v = &vif_table[vifi];
283
284         write_lock_bh(&mrt_lock);
285         dev = v->dev;
286         v->dev = NULL;
287
288         if (!dev) {
289                 write_unlock_bh(&mrt_lock);
290                 return -EADDRNOTAVAIL;
291         }
292
293 #ifdef CONFIG_IP_PIMSM
294         if (vifi == reg_vif_num)
295                 reg_vif_num = -1;
296 #endif
297
298         if (vifi+1 == maxvif) {
299                 int tmp;
300                 for (tmp=vifi-1; tmp>=0; tmp--) {
301                         if (VIF_EXISTS(tmp))
302                                 break;
303                 }
304                 maxvif = tmp+1;
305         }
306
307         write_unlock_bh(&mrt_lock);
308
309         dev_set_allmulti(dev, -1);
310
311         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
312                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
313                 ip_rt_multicast_event(in_dev);
314         }
315
316         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
317                 unregister_netdevice(dev);
318
319         dev_put(dev);
320         return 0;
321 }
322
323 /* Destroy an unresolved cache entry, killing queued skbs
324    and reporting error to netlink readers.
325  */
326
327 static void ipmr_destroy_unres(struct mfc_cache *c)
328 {
329         struct sk_buff *skb;
330         struct nlmsgerr *e;
331
332         atomic_dec(&cache_resolve_queue_len);
333
334         while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
335                 if (ip_hdr(skb)->version == 0) {
336                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
337                         nlh->nlmsg_type = NLMSG_ERROR;
338                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
339                         skb_trim(skb, nlh->nlmsg_len);
340                         e = NLMSG_DATA(nlh);
341                         e->error = -ETIMEDOUT;
342                         memset(&e->msg, 0, sizeof(e->msg));
343
344                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
345                 } else
346                         kfree_skb(skb);
347         }
348
349         kmem_cache_free(mrt_cachep, c);
350 }
351
352
353 /* Single timer process for all the unresolved queue. */
354
355 static void ipmr_expire_process(unsigned long dummy)
356 {
357         unsigned long now;
358         unsigned long expires;
359         struct mfc_cache *c, **cp;
360
361         if (!spin_trylock(&mfc_unres_lock)) {
362                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
363                 return;
364         }
365
366         if (atomic_read(&cache_resolve_queue_len) == 0)
367                 goto out;
368
369         now = jiffies;
370         expires = 10*HZ;
371         cp = &mfc_unres_queue;
372
373         while ((c=*cp) != NULL) {
374                 if (time_after(c->mfc_un.unres.expires, now)) {
375                         unsigned long interval = c->mfc_un.unres.expires - now;
376                         if (interval < expires)
377                                 expires = interval;
378                         cp = &c->next;
379                         continue;
380                 }
381
382                 *cp = c->next;
383
384                 ipmr_destroy_unres(c);
385         }
386
387         if (atomic_read(&cache_resolve_queue_len))
388                 mod_timer(&ipmr_expire_timer, jiffies + expires);
389
390 out:
391         spin_unlock(&mfc_unres_lock);
392 }
393
394 /* Fill the oif list. Called under write-locked mrt_lock. */
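/* A ttls[] entry of 0 or 255 means "do not forward on this vif"; any other
 * value is a TTL threshold: ip_mr_forward() transmits on vif i only when
 * the packet's TTL is strictly greater than ttls[i].  minvif/maxvif merely
 * bound the range of vifs worth scanning.
 */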
395
396 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
397 {
398         int vifi;
399
400         cache->mfc_un.res.minvif = MAXVIFS;
401         cache->mfc_un.res.maxvif = 0;
402         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
403
404         for (vifi=0; vifi<maxvif; vifi++) {
405                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
406                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
407                         if (cache->mfc_un.res.minvif > vifi)
408                                 cache->mfc_un.res.minvif = vifi;
409                         if (cache->mfc_un.res.maxvif <= vifi)
410                                 cache->mfc_un.res.maxvif = vifi + 1;
411                 }
412         }
413 }
414
415 static int vif_add(struct vifctl *vifc, int mrtsock)
416 {
417         int vifi = vifc->vifc_vifi;
418         struct vif_device *v = &vif_table[vifi];
419         struct net_device *dev;
420         struct in_device *in_dev;
421         int err;
422
423         /* Is vif busy ? */
424         if (VIF_EXISTS(vifi))
425                 return -EADDRINUSE;
426
427         switch (vifc->vifc_flags) {
428 #ifdef CONFIG_IP_PIMSM
429         case VIFF_REGISTER:
430                 /*
431                  * Special Purpose VIF in PIM
432                  * All the packets will be sent to the daemon
433                  */
434                 if (reg_vif_num >= 0)
435                         return -EADDRINUSE;
436                 dev = ipmr_reg_vif();
437                 if (!dev)
438                         return -ENOBUFS;
439                 err = dev_set_allmulti(dev, 1);
440                 if (err) {
441                         unregister_netdevice(dev);
442                         dev_put(dev);
443                         return err;
444                 }
445                 break;
446 #endif
447         case VIFF_TUNNEL:
448                 dev = ipmr_new_tunnel(vifc);
449                 if (!dev)
450                         return -ENOBUFS;
451                 err = dev_set_allmulti(dev, 1);
452                 if (err) {
453                         ipmr_del_tunnel(dev, vifc);
454                         dev_put(dev);
455                         return err;
456                 }
457                 break;
458         case 0:
459                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
460                 if (!dev)
461                         return -EADDRNOTAVAIL;
462                 err = dev_set_allmulti(dev, 1);
463                 if (err) {
464                         dev_put(dev);
465                         return err;
466                 }
467                 break;
468         default:
469                 return -EINVAL;
470         }
471
472         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
473                 return -EADDRNOTAVAIL;
474         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
475         ip_rt_multicast_event(in_dev);
476
477         /*
478          *      Fill in the VIF structures
479          */
480         v->rate_limit=vifc->vifc_rate_limit;
481         v->local=vifc->vifc_lcl_addr.s_addr;
482         v->remote=vifc->vifc_rmt_addr.s_addr;
483         v->flags=vifc->vifc_flags;
484         if (!mrtsock)
485                 v->flags |= VIFF_STATIC;
486         v->threshold=vifc->vifc_threshold;
487         v->bytes_in = 0;
488         v->bytes_out = 0;
489         v->pkt_in = 0;
490         v->pkt_out = 0;
491         v->link = dev->ifindex;
492         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
493                 v->link = dev->iflink;
494
495         /* And finish update writing critical data */
496         write_lock_bh(&mrt_lock);
497         v->dev=dev;
498 #ifdef CONFIG_IP_PIMSM
499         if (v->flags&VIFF_REGISTER)
500                 reg_vif_num = vifi;
501 #endif
502         if (vifi+1 > maxvif)
503                 maxvif = vifi+1;
504         write_unlock_bh(&mrt_lock);
505         return 0;
506 }
507
508 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
509 {
510         int line=MFC_HASH(mcastgrp,origin);
511         struct mfc_cache *c;
512
513         for (c=mfc_cache_array[line]; c; c = c->next) {
514                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
515                         break;
516         }
517         return c;
518 }
519
520 /*
521  *      Allocate a multicast cache entry
522  */
523 static struct mfc_cache *ipmr_cache_alloc(void)
524 {
525         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
526         if (c==NULL)
527                 return NULL;
528         c->mfc_un.res.minvif = MAXVIFS;
529         return c;
530 }
531
532 static struct mfc_cache *ipmr_cache_alloc_unres(void)
533 {
534         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
535         if (c==NULL)
536                 return NULL;
537         skb_queue_head_init(&c->mfc_un.unres.unresolved);
538         c->mfc_un.unres.expires = jiffies + 10*HZ;
539         return c;
540 }
541
542 /*
543  *      A cache entry has moved from the unresolved queue to the resolved state
544  */
545
546 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
547 {
548         struct sk_buff *skb;
549         struct nlmsgerr *e;
550
551         /*
552          *      Play the pending entries through our router
553          */
554
555         while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
556                 if (ip_hdr(skb)->version == 0) {
557                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
558
559                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
560                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
561                                                   (u8 *)nlh);
562                         } else {
563                                 nlh->nlmsg_type = NLMSG_ERROR;
564                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
565                                 skb_trim(skb, nlh->nlmsg_len);
566                                 e = NLMSG_DATA(nlh);
567                                 e->error = -EMSGSIZE;
568                                 memset(&e->msg, 0, sizeof(e->msg));
569                         }
570
571                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
572                 } else
573                         ip_mr_forward(skb, c, 0);
574         }
575 }
576
577 /*
578  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
579  *      expects the following bizarre scheme.
580  *
581  *      Called under mrt_lock.
582  */
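/* The "bizarre scheme": the upcall is delivered on the daemon's raw IGMP
 * socket as a packet whose IP header doubles as a struct igmpmsg (the two
 * layouts in linux/mroute.h are assumed to overlap by design): im_mbz
 * overlays iph->protocol and must be zero so mrouted can tell upcalls from
 * real IGMP traffic, while im_src/im_dst overlay iph->saddr/iph->daddr.
 */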
583
584 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
585 {
586         struct sk_buff *skb;
587         const int ihl = ip_hdrlen(pkt);
588         struct igmphdr *igmp;
589         struct igmpmsg *msg;
590         int ret;
591
592 #ifdef CONFIG_IP_PIMSM
593         if (assert == IGMPMSG_WHOLEPKT)
594                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
595         else
596 #endif
597                 skb = alloc_skb(128, GFP_ATOMIC);
598
599         if (!skb)
600                 return -ENOBUFS;
601
602 #ifdef CONFIG_IP_PIMSM
603         if (assert == IGMPMSG_WHOLEPKT) {
604                 /* Ugly, but we have no choice with this interface.
605                    Duplicate old header, fix ihl, length etc.
606                    And all this only to mangle msg->im_msgtype and
607                    to set msg->im_mbz to "mbz" :-)
608                  */
609                 skb_push(skb, sizeof(struct iphdr));
610                 skb_reset_network_header(skb);
611                 skb_reset_transport_header(skb);
612                 msg = (struct igmpmsg *)skb_network_header(skb);
613                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
614                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
615                 msg->im_mbz = 0;
616                 msg->im_vif = reg_vif_num;
617                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
618                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
619                                              sizeof(struct iphdr));
620         } else
621 #endif
622         {
623
624         /*
625          *      Copy the IP header
626          */
627
628         skb->network_header = skb->tail;
629         skb_put(skb, ihl);
630         skb_copy_to_linear_data(skb, pkt->data, ihl);
631         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
632         msg = (struct igmpmsg *)skb_network_header(skb);
633         msg->im_vif = vifi;
634         skb->dst = dst_clone(pkt->dst);
635
636         /*
637          *      Add our header
638          */
639
640         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
641         igmp->type      =
642         msg->im_msgtype = assert;
643         igmp->code      =       0;
644         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
645         skb->transport_header = skb->network_header;
646         }
647
648         if (mroute_socket == NULL) {
649                 kfree_skb(skb);
650                 return -EINVAL;
651         }
652
653         /*
654          *      Deliver to mrouted
655          */
656         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
657                 if (net_ratelimit())
658                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
659                 kfree_skb(skb);
660         }
661
662         return ret;
663 }
664
665 /*
666  *      Queue a packet for resolution. The unresolved cache entry is created and handled under mfc_unres_lock.
667  */
668
669 static int
670 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
671 {
672         int err;
673         struct mfc_cache *c;
674         const struct iphdr *iph = ip_hdr(skb);
675
676         spin_lock_bh(&mfc_unres_lock);
677         for (c=mfc_unres_queue; c; c=c->next) {
678                 if (c->mfc_mcastgrp == iph->daddr &&
679                     c->mfc_origin == iph->saddr)
680                         break;
681         }
682
683         if (c == NULL) {
684                 /*
685                  *      Create a new entry if allowable
686                  */
687
688                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
689                     (c=ipmr_cache_alloc_unres())==NULL) {
690                         spin_unlock_bh(&mfc_unres_lock);
691
692                         kfree_skb(skb);
693                         return -ENOBUFS;
694                 }
695
696                 /*
697                  *      Fill in the new cache entry
698                  */
699                 c->mfc_parent   = -1;
700                 c->mfc_origin   = iph->saddr;
701                 c->mfc_mcastgrp = iph->daddr;
702
703                 /*
704                  *      Reflect first query at mrouted.
705                  */
706                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
707                         /* If the report failed throw the cache entry
708                            out - Brad Parker
709                          */
710                         spin_unlock_bh(&mfc_unres_lock);
711
712                         kmem_cache_free(mrt_cachep, c);
713                         kfree_skb(skb);
714                         return err;
715                 }
716
717                 atomic_inc(&cache_resolve_queue_len);
718                 c->next = mfc_unres_queue;
719                 mfc_unres_queue = c;
720
721                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
722         }
723
724         /*
725          *      See if we can append the packet
726          */
727         if (c->mfc_un.unres.unresolved.qlen>3) {
728                 kfree_skb(skb);
729                 err = -ENOBUFS;
730         } else {
731                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
732                 err = 0;
733         }
734
735         spin_unlock_bh(&mfc_unres_lock);
736         return err;
737 }
738
739 /*
740  *      MFC cache manipulation by user space mroute daemon
741  */
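/* A minimal userspace sketch (hypothetical addresses and vif numbers) of
 * how a daemon such as mrouted installs an (S,G) entry through this
 * interface:
 *
 *      struct mfcctl mc;
 *      memset(&mc, 0, sizeof(mc));
 *      inet_aton("10.0.0.1", &mc.mfcc_origin);        // S
 *      inet_aton("239.1.2.3", &mc.mfcc_mcastgrp);     // G
 *      mc.mfcc_parent = 0;                            // expected input vif
 *      mc.mfcc_ttls[1] = 1;                           // forward on vif 1
 *      setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 * where fd is the raw IGMP socket on which MRT_INIT was issued (see
 * ip_mroute_setsockopt() below).
 */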
742
743 static int ipmr_mfc_delete(struct mfcctl *mfc)
744 {
745         int line;
746         struct mfc_cache *c, **cp;
747
748         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
749
750         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
751                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
752                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
753                         write_lock_bh(&mrt_lock);
754                         *cp = c->next;
755                         write_unlock_bh(&mrt_lock);
756
757                         kmem_cache_free(mrt_cachep, c);
758                         return 0;
759                 }
760         }
761         return -ENOENT;
762 }
763
764 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
765 {
766         int line;
767         struct mfc_cache *uc, *c, **cp;
768
769         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
770
771         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
772                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
773                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
774                         break;
775         }
776
777         if (c != NULL) {
778                 write_lock_bh(&mrt_lock);
779                 c->mfc_parent = mfc->mfcc_parent;
780                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
781                 if (!mrtsock)
782                         c->mfc_flags |= MFC_STATIC;
783                 write_unlock_bh(&mrt_lock);
784                 return 0;
785         }
786
787         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
788                 return -EINVAL;
789
790         c=ipmr_cache_alloc();
791         if (c==NULL)
792                 return -ENOMEM;
793
794         c->mfc_origin=mfc->mfcc_origin.s_addr;
795         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
796         c->mfc_parent=mfc->mfcc_parent;
797         ipmr_update_thresholds(c, mfc->mfcc_ttls);
798         if (!mrtsock)
799                 c->mfc_flags |= MFC_STATIC;
800
801         write_lock_bh(&mrt_lock);
802         c->next = mfc_cache_array[line];
803         mfc_cache_array[line] = c;
804         write_unlock_bh(&mrt_lock);
805
806         /*
807          *      Check to see if we resolved a queued (unresolved) entry.
808          *      If so, send the pending frames on and tidy up.
809          */
810         spin_lock_bh(&mfc_unres_lock);
811         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
812              cp = &uc->next) {
813                 if (uc->mfc_origin == c->mfc_origin &&
814                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
815                         *cp = uc->next;
816                         if (atomic_dec_and_test(&cache_resolve_queue_len))
817                                 del_timer(&ipmr_expire_timer);
818                         break;
819                 }
820         }
821         spin_unlock_bh(&mfc_unres_lock);
822
823         if (uc) {
824                 ipmr_cache_resolve(uc, c);
825                 kmem_cache_free(mrt_cachep, uc);
826         }
827         return 0;
828 }
829
830 /*
831  *      Close the multicast socket, and clear the vif tables etc
832  */
833
834 static void mroute_clean_tables(struct sock *sk)
835 {
836         int i;
837
838         /*
839          *      Shut down all active vif entries
840          */
841         for (i=0; i<maxvif; i++) {
842                 if (!(vif_table[i].flags&VIFF_STATIC))
843                         vif_delete(i, 0);
844         }
845
846         /*
847          *      Wipe the cache
848          */
849         for (i=0;i<MFC_LINES;i++) {
850                 struct mfc_cache *c, **cp;
851
852                 cp = &mfc_cache_array[i];
853                 while ((c = *cp) != NULL) {
854                         if (c->mfc_flags&MFC_STATIC) {
855                                 cp = &c->next;
856                                 continue;
857                         }
858                         write_lock_bh(&mrt_lock);
859                         *cp = c->next;
860                         write_unlock_bh(&mrt_lock);
861
862                         kmem_cache_free(mrt_cachep, c);
863                 }
864         }
865
866         if (atomic_read(&cache_resolve_queue_len) != 0) {
867                 struct mfc_cache *c;
868
869                 spin_lock_bh(&mfc_unres_lock);
870                 while (mfc_unres_queue != NULL) {
871                         c = mfc_unres_queue;
872                         mfc_unres_queue = c->next;
873                         spin_unlock_bh(&mfc_unres_lock);
874
875                         ipmr_destroy_unres(c);
876
877                         spin_lock_bh(&mfc_unres_lock);
878                 }
879                 spin_unlock_bh(&mfc_unres_lock);
880         }
881 }
882
883 static void mrtsock_destruct(struct sock *sk)
884 {
885         rtnl_lock();
886         if (sk == mroute_socket) {
887                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
888
889                 write_lock_bh(&mrt_lock);
890                 mroute_socket=NULL;
891                 write_unlock_bh(&mrt_lock);
892
893                 mroute_clean_tables(sk);
894         }
895         rtnl_unlock();
896 }
897
898 /*
899  *      Socket options and virtual interface manipulation. The whole
900  *      virtual interface system is a complete heap, but unfortunately
901  *      that's how BSD mrouted happens to think. Maybe one day with a proper
902  *      MOSPF/PIM router set up we can clean this up.
903  */
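/* For reference, the (assumed) userspace side of this protocol: the daemon
 * becomes the multicast router with
 *
 *      int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int on = 1;
 *      setsockopt(fd, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *
 * then adds interfaces and routes with MRT_ADD_VIF/MRT_ADD_MFC and reads
 * IGMPMSG_* upcalls from the same socket.  Only one such socket may exist
 * at a time (enforced via mroute_socket below).
 */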
904
905 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
906 {
907         int ret;
908         struct vifctl vif;
909         struct mfcctl mfc;
910
911         if (optname != MRT_INIT) {
912                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
913                         return -EACCES;
914         }
915
916         switch (optname) {
917         case MRT_INIT:
918                 if (sk->sk_type != SOCK_RAW ||
919                     inet_sk(sk)->num != IPPROTO_IGMP)
920                         return -EOPNOTSUPP;
921                 if (optlen!=sizeof(int))
922                         return -ENOPROTOOPT;
923
924                 rtnl_lock();
925                 if (mroute_socket) {
926                         rtnl_unlock();
927                         return -EADDRINUSE;
928                 }
929
930                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
931                 if (ret == 0) {
932                         write_lock_bh(&mrt_lock);
933                         mroute_socket=sk;
934                         write_unlock_bh(&mrt_lock);
935
936                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
937                 }
938                 rtnl_unlock();
939                 return ret;
940         case MRT_DONE:
941                 if (sk!=mroute_socket)
942                         return -EACCES;
943                 return ip_ra_control(sk, 0, NULL);
944         case MRT_ADD_VIF:
945         case MRT_DEL_VIF:
946                 if (optlen!=sizeof(vif))
947                         return -EINVAL;
948                 if (copy_from_user(&vif,optval,sizeof(vif)))
949                         return -EFAULT;
950                 if (vif.vifc_vifi >= MAXVIFS)
951                         return -ENFILE;
952                 rtnl_lock();
953                 if (optname==MRT_ADD_VIF) {
954                         ret = vif_add(&vif, sk==mroute_socket);
955                 } else {
956                         ret = vif_delete(vif.vifc_vifi, 0);
957                 }
958                 rtnl_unlock();
959                 return ret;
960
961                 /*
962                  *      Manipulate the forwarding caches. These live
963                  *      in a sort of kernel/user symbiosis.
964                  */
965         case MRT_ADD_MFC:
966         case MRT_DEL_MFC:
967                 if (optlen!=sizeof(mfc))
968                         return -EINVAL;
969                 if (copy_from_user(&mfc,optval, sizeof(mfc)))
970                         return -EFAULT;
971                 rtnl_lock();
972                 if (optname==MRT_DEL_MFC)
973                         ret = ipmr_mfc_delete(&mfc);
974                 else
975                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
976                 rtnl_unlock();
977                 return ret;
978                 /*
979                  *      Control PIM assert.
980                  */
981         case MRT_ASSERT:
982         {
983                 int v;
984                 if (get_user(v,(int __user *)optval))
985                         return -EFAULT;
986                 mroute_do_assert=(v)?1:0;
987                 return 0;
988         }
989 #ifdef CONFIG_IP_PIMSM
990         case MRT_PIM:
991         {
992                 int v;
993
994                 if (get_user(v,(int __user *)optval))
995                         return -EFAULT;
996                 v = (v) ? 1 : 0;
997
998                 rtnl_lock();
999                 ret = 0;
1000                 if (v != mroute_do_pim) {
1001                         mroute_do_pim = v;
1002                         mroute_do_assert = v;
1003 #ifdef CONFIG_IP_PIMSM_V2
1004                         if (mroute_do_pim)
1005                                 ret = inet_add_protocol(&pim_protocol,
1006                                                         IPPROTO_PIM);
1007                         else
1008                                 ret = inet_del_protocol(&pim_protocol,
1009                                                         IPPROTO_PIM);
1010                         if (ret < 0)
1011                                 ret = -EAGAIN;
1012 #endif
1013                 }
1014                 rtnl_unlock();
1015                 return ret;
1016         }
1017 #endif
1018         /*
1019          *      Spurious command, or MRT_VERSION which you cannot
1020          *      set.
1021          */
1022         default:
1023                 return -ENOPROTOOPT;
1024         }
1025 }
1026
1027 /*
1028  *      Getsockopt support for the multicast routing system.
1029  */
1030
1031 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
1032 {
1033         int olr;
1034         int val;
1035
1036         if (optname!=MRT_VERSION &&
1037 #ifdef CONFIG_IP_PIMSM
1038            optname!=MRT_PIM &&
1039 #endif
1040            optname!=MRT_ASSERT)
1041                 return -ENOPROTOOPT;
1042
1043         if (get_user(olr, optlen))
1044                 return -EFAULT;
1045
1046         if (olr < 0)
1047                 return -EINVAL;
1048         olr = min_t(unsigned int, olr, sizeof(int));
1049
1050         if (put_user(olr,optlen))
1051                 return -EFAULT;
1052         if (optname==MRT_VERSION)
1053                 val=0x0305;
1054 #ifdef CONFIG_IP_PIMSM
1055         else if (optname==MRT_PIM)
1056                 val=mroute_do_pim;
1057 #endif
1058         else
1059                 val=mroute_do_assert;
1060         if (copy_to_user(optval,&val,olr))
1061                 return -EFAULT;
1062         return 0;
1063 }
1064
1065 /*
1066  *      The IP multicast ioctl support routines.
1067  */
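/* Userspace sketch (usage assumed; the structures are from linux/mroute.h):
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      if (ioctl(fd, SIOCGETVIFCNT, &vr) == 0)
 *              printf("vif0: %lu pkts in, %lu out\n", vr.icount, vr.ocount);
 */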
1068
1069 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1070 {
1071         struct sioc_sg_req sr;
1072         struct sioc_vif_req vr;
1073         struct vif_device *vif;
1074         struct mfc_cache *c;
1075
1076         switch (cmd) {
1077         case SIOCGETVIFCNT:
1078                 if (copy_from_user(&vr,arg,sizeof(vr)))
1079                         return -EFAULT;
1080                 if (vr.vifi>=maxvif)
1081                         return -EINVAL;
1082                 read_lock(&mrt_lock);
1083                 vif=&vif_table[vr.vifi];
1084                 if (VIF_EXISTS(vr.vifi))        {
1085                         vr.icount=vif->pkt_in;
1086                         vr.ocount=vif->pkt_out;
1087                         vr.ibytes=vif->bytes_in;
1088                         vr.obytes=vif->bytes_out;
1089                         read_unlock(&mrt_lock);
1090
1091                         if (copy_to_user(arg,&vr,sizeof(vr)))
1092                                 return -EFAULT;
1093                         return 0;
1094                 }
1095                 read_unlock(&mrt_lock);
1096                 return -EADDRNOTAVAIL;
1097         case SIOCGETSGCNT:
1098                 if (copy_from_user(&sr,arg,sizeof(sr)))
1099                         return -EFAULT;
1100
1101                 read_lock(&mrt_lock);
1102                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1103                 if (c) {
1104                         sr.pktcnt = c->mfc_un.res.pkt;
1105                         sr.bytecnt = c->mfc_un.res.bytes;
1106                         sr.wrong_if = c->mfc_un.res.wrong_if;
1107                         read_unlock(&mrt_lock);
1108
1109                         if (copy_to_user(arg,&sr,sizeof(sr)))
1110                                 return -EFAULT;
1111                         return 0;
1112                 }
1113                 read_unlock(&mrt_lock);
1114                 return -EADDRNOTAVAIL;
1115         default:
1116                 return -ENOIOCTLCMD;
1117         }
1118 }
1119
1120
1121 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1122 {
1123         struct net_device *dev = ptr;
1124         struct vif_device *v;
1125         int ct;
1126
1127         if (!net_eq(dev_net(dev), &init_net))
1128                 return NOTIFY_DONE;
1129
1130         if (event != NETDEV_UNREGISTER)
1131                 return NOTIFY_DONE;
1132         v=&vif_table[0];
1133         for (ct=0;ct<maxvif;ct++,v++) {
1134                 if (v->dev==dev)
1135                         vif_delete(ct, 1);
1136         }
1137         return NOTIFY_DONE;
1138 }
1139
1140
1141 static struct notifier_block ip_mr_notifier={
1142         .notifier_call = ipmr_device_event,
1143 };
1144
1145 /*
1146  *      Encapsulate a packet by attaching a valid IPIP header to it.
1147  *      This avoids tunnel drivers and other mess and gives us the speed so
1148  *      important for multicast video.
1149  */
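/* Resulting layout (sketch):  [ outer iphdr | original iphdr | payload ].
 * The outer header copies tos/ttl from the inner one and carries protocol
 * IPPROTO_IPIP, which is what the receiving tunnel endpoint expects.
 */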
1150
1151 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1152 {
1153         struct iphdr *iph;
1154         struct iphdr *old_iph = ip_hdr(skb);
1155
1156         skb_push(skb, sizeof(struct iphdr));
1157         skb->transport_header = skb->network_header;
1158         skb_reset_network_header(skb);
1159         iph = ip_hdr(skb);
1160
1161         iph->version    =       4;
1162         iph->tos        =       old_iph->tos;
1163         iph->ttl        =       old_iph->ttl;
1164         iph->frag_off   =       0;
1165         iph->daddr      =       daddr;
1166         iph->saddr      =       saddr;
1167         iph->protocol   =       IPPROTO_IPIP;
1168         iph->ihl        =       5;
1169         iph->tot_len    =       htons(skb->len);
1170         ip_select_ident(iph, skb->dst, NULL);
1171         ip_send_check(iph);
1172
1173         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1174         nf_reset(skb);
1175 }
1176
1177 static inline int ipmr_forward_finish(struct sk_buff *skb)
1178 {
1179         struct ip_options * opt = &(IPCB(skb)->opt);
1180
1181         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1182
1183         if (unlikely(opt->optlen))
1184                 ip_forward_options(skb);
1185
1186         return dst_output(skb);
1187 }
1188
1189 /*
1190  *      Processing handlers for ipmr_forward
1191  */
1192
1193 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1194 {
1195         const struct iphdr *iph = ip_hdr(skb);
1196         struct vif_device *vif = &vif_table[vifi];
1197         struct net_device *dev;
1198         struct rtable *rt;
1199         int    encap = 0;
1200
1201         if (vif->dev == NULL)
1202                 goto out_free;
1203
1204 #ifdef CONFIG_IP_PIMSM
1205         if (vif->flags & VIFF_REGISTER) {
1206                 vif->pkt_out++;
1207                 vif->bytes_out+=skb->len;
1208                 vif->dev->stats.tx_bytes += skb->len;
1209                 vif->dev->stats.tx_packets++;
1210                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1211                 kfree_skb(skb);
1212                 return;
1213         }
1214 #endif
1215
1216         if (vif->flags&VIFF_TUNNEL) {
1217                 struct flowi fl = { .oif = vif->link,
1218                                     .nl_u = { .ip4_u =
1219                                               { .daddr = vif->remote,
1220                                                 .saddr = vif->local,
1221                                                 .tos = RT_TOS(iph->tos) } },
1222                                     .proto = IPPROTO_IPIP };
1223                 if (ip_route_output_key(&init_net, &rt, &fl))
1224                         goto out_free;
1225                 encap = sizeof(struct iphdr);
1226         } else {
1227                 struct flowi fl = { .oif = vif->link,
1228                                     .nl_u = { .ip4_u =
1229                                               { .daddr = iph->daddr,
1230                                                 .tos = RT_TOS(iph->tos) } },
1231                                     .proto = IPPROTO_IPIP };
1232                 if (ip_route_output_key(&init_net, &rt, &fl))
1233                         goto out_free;
1234         }
1235
1236         dev = rt->u.dst.dev;
1237
1238         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1239                 /* Do not fragment multicasts. Alas, IPv4 does not
1240                    allow us to send ICMP here, so the packets will
1241                    disappear into a black hole.
1242                  */
1243
1244                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1245                 ip_rt_put(rt);
1246                 goto out_free;
1247         }
1248
1249         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1250
1251         if (skb_cow(skb, encap)) {
1252                 ip_rt_put(rt);
1253                 goto out_free;
1254         }
1255
1256         vif->pkt_out++;
1257         vif->bytes_out+=skb->len;
1258
1259         dst_release(skb->dst);
1260         skb->dst = &rt->u.dst;
1261         ip_decrease_ttl(ip_hdr(skb));
1262
1263         /* FIXME: forward and output firewalls used to be called here.
1264          * What do we do with netfilter? -- RR */
1265         if (vif->flags & VIFF_TUNNEL) {
1266                 ip_encap(skb, vif->local, vif->remote);
1267                 /* FIXME: extra output firewall step used to be here. --RR */
1268                 vif->dev->stats.tx_packets++;
1269                 vif->dev->stats.tx_bytes += skb->len;
1270         }
1271
1272         IPCB(skb)->flags |= IPSKB_FORWARDED;
1273
1274         /*
1275          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1276          * not only before forwarding, but also after forwarding on all output
1277          * interfaces.  Clearly, if the mrouter runs a multicasting
1278          * program, that program should receive packets regardless of which
1279          * interface it joined on.
1280          * If we did not do this, the program would have to join on all
1281          * interfaces.  On the other hand, a multihomed host (or a router, but
1282          * not an mrouter) cannot join on more than one interface - it would
1283          * receive duplicate packets.
1284          */
1285         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1286                 ipmr_forward_finish);
1287         return;
1288
1289 out_free:
1290         kfree_skb(skb);
1291         return;
1292 }
1293
1294 static int ipmr_find_vif(struct net_device *dev)
1295 {
1296         int ct;
1297         for (ct=maxvif-1; ct>=0; ct--) {
1298                 if (vif_table[ct].dev == dev)
1299                         break;
1300         }
1301         return ct;
1302 }
1303
1304 /* "local" means that we should preserve one skb (for local delivery) */
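/* Forwarding strategy below: walk the oif list, clone the skb for every
 * output vif except the last one found, and send the original skb on that
 * last vif (unless local delivery still needs it) - one clone fewer per
 * packet.
 */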
1305
1306 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1307 {
1308         int psend = -1;
1309         int vif, ct;
1310
1311         vif = cache->mfc_parent;
1312         cache->mfc_un.res.pkt++;
1313         cache->mfc_un.res.bytes += skb->len;
1314
1315         /*
1316          * Wrong interface: drop packet and (maybe) send PIM assert.
1317          */
1318         if (vif_table[vif].dev != skb->dev) {
1319                 int true_vifi;
1320
1321                 if (skb->rtable->fl.iif == 0) {
1322                         /* It is our own packet, looped back.
1323                            Very complicated situation...
1324
1325                            The best workaround until routing daemons are
1326                            fixed is not to redistribute a packet if it was
1327                            sent through the wrong interface. It means that
1328                            multicast applications WILL NOT work for
1329                            (S,G) entries whose default multicast route points
1330                            to the wrong oif. In any case, it is not a good
1331                            idea to run multicasting applications on a router.
1332                          */
1333                         goto dont_forward;
1334                 }
1335
1336                 cache->mfc_un.res.wrong_if++;
1337                 true_vifi = ipmr_find_vif(skb->dev);
1338
1339                 if (true_vifi >= 0 && mroute_do_assert &&
1340                     /* PIM-SM uses asserts when switching from RPT to SPT,
1341                        so we cannot check that the packet arrived on an oif.
1342                        It is bad, but otherwise we would need to move a pretty
1343                        large chunk of pimd into the kernel. Ough... --ANK
1344                      */
1345                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1346                     time_after(jiffies,
1347                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1348                         cache->mfc_un.res.last_assert = jiffies;
1349                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1350                 }
1351                 goto dont_forward;
1352         }
1353
1354         vif_table[vif].pkt_in++;
1355         vif_table[vif].bytes_in+=skb->len;
1356
1357         /*
1358          *      Forward the frame
1359          */
1360         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1361                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1362                         if (psend != -1) {
1363                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1364                                 if (skb2)
1365                                         ipmr_queue_xmit(skb2, cache, psend);
1366                         }
1367                         psend=ct;
1368                 }
1369         }
1370         if (psend != -1) {
1371                 if (local) {
1372                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1373                         if (skb2)
1374                                 ipmr_queue_xmit(skb2, cache, psend);
1375                 } else {
1376                         ipmr_queue_xmit(skb, cache, psend);
1377                         return 0;
1378                 }
1379         }
1380
1381 dont_forward:
1382         if (!local)
1383                 kfree_skb(skb);
1384         return 0;
1385 }
1386
1387
1388 /*
1389  *      Multicast packets for forwarding arrive here
1390  */
1391
1392 int ip_mr_input(struct sk_buff *skb)
1393 {
1394         struct mfc_cache *cache;
1395         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1396
1397         /* Packet is looped back after forwarding; it should not be
1398            forwarded a second time, but it can still be delivered locally.
1399          */
1400         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1401                 goto dont_forward;
1402
1403         if (!local) {
1404                     if (IPCB(skb)->opt.router_alert) {
1405                             if (ip_call_ra_chain(skb))
1406                                     return 0;
1407                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1408                             /* IGMPv1 (and broken IGMPv2 implementations such as
1409                                Cisco IOS <= 11.2(8)) do not put the router alert
1410                                option in IGMP packets destined for routable
1411                                groups. It is very bad, because it means
1412                                that we can forward NO IGMP messages.
1413                              */
1414                             read_lock(&mrt_lock);
1415                             if (mroute_socket) {
1416                                     nf_reset(skb);
1417                                     raw_rcv(mroute_socket, skb);
1418                                     read_unlock(&mrt_lock);
1419                                     return 0;
1420                             }
1421                             read_unlock(&mrt_lock);
1422                     }
1423         }
1424
1425         read_lock(&mrt_lock);
1426         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1427
1428         /*
1429          *      No usable cache entry
1430          */
1431         if (cache==NULL) {
1432                 int vif;
1433
1434                 if (local) {
1435                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1436                         ip_local_deliver(skb);
1437                         if (skb2 == NULL) {
1438                                 read_unlock(&mrt_lock);
1439                                 return -ENOBUFS;
1440                         }
1441                         skb = skb2;
1442                 }
1443
1444                 vif = ipmr_find_vif(skb->dev);
1445                 if (vif >= 0) {
1446                         int err = ipmr_cache_unresolved(vif, skb);
1447                         read_unlock(&mrt_lock);
1448
1449                         return err;
1450                 }
1451                 read_unlock(&mrt_lock);
1452                 kfree_skb(skb);
1453                 return -ENODEV;
1454         }
1455
1456         ip_mr_forward(skb, cache, local);
1457
1458         read_unlock(&mrt_lock);
1459
1460         if (local)
1461                 return ip_local_deliver(skb);
1462
1463         return 0;
1464
1465 dont_forward:
1466         if (local)
1467                 return ip_local_deliver(skb);
1468         kfree_skb(skb);
1469         return 0;
1470 }
1471
1472 #ifdef CONFIG_IP_PIMSM_V1
1473 /*
1474  * Handle IGMP messages of PIMv1
1475  */
1476
1477 int pim_rcv_v1(struct sk_buff * skb)
1478 {
1479         struct igmphdr *pim;
1480         struct iphdr   *encap;
1481         struct net_device  *reg_dev = NULL;
1482
1483         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1484                 goto drop;
1485
1486         pim = igmp_hdr(skb);
1487
1488         if (!mroute_do_pim ||
1489             skb->len < sizeof(*pim) + sizeof(*encap) ||
1490             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1491                 goto drop;
1492
1493         encap = (struct iphdr *)(skb_transport_header(skb) +
1494                                  sizeof(struct igmphdr));
1495         /*
1496            Check that:
1497            a. packet is really destined to a multicast group
1498            b. packet is not a NULL-REGISTER
1499            c. packet is not truncated
1500          */
1501         if (!ipv4_is_multicast(encap->daddr) ||
1502             encap->tot_len == 0 ||
1503             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1504                 goto drop;
1505
1506         read_lock(&mrt_lock);
1507         if (reg_vif_num >= 0)
1508                 reg_dev = vif_table[reg_vif_num].dev;
1509         if (reg_dev)
1510                 dev_hold(reg_dev);
1511         read_unlock(&mrt_lock);
1512
1513         if (reg_dev == NULL)
1514                 goto drop;
1515
1516         skb->mac_header = skb->network_header;
1517         skb_pull(skb, (u8*)encap - skb->data);
1518         skb_reset_network_header(skb);
1519         skb->dev = reg_dev;
1520         skb->protocol = htons(ETH_P_IP);
1521         skb->ip_summed = 0;
1522         skb->pkt_type = PACKET_HOST;
1523         dst_release(skb->dst);
1524         skb->dst = NULL;
1525         reg_dev->stats.rx_bytes += skb->len;
1526         reg_dev->stats.rx_packets++;
1527         nf_reset(skb);
1528         netif_rx(skb);
1529         dev_put(reg_dev);
1530         return 0;
1531  drop:
1532         kfree_skb(skb);
1533         return 0;
1534 }
1535 #endif
1536
1537 #ifdef CONFIG_IP_PIMSM_V2
1538 static int pim_rcv(struct sk_buff * skb)
1539 {
1540         struct pimreghdr *pim;
1541         struct iphdr   *encap;
1542         struct net_device  *reg_dev = NULL;
1543
1544         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1545                 goto drop;
1546
1547         pim = (struct pimreghdr *)skb_transport_header(skb);
1548         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1549             (pim->flags&PIM_NULL_REGISTER) ||
1550             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1551              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1552                 goto drop;
1553
1554         /* check if the inner packet is destined to mcast group */
1555         encap = (struct iphdr *)(skb_transport_header(skb) +
1556                                  sizeof(struct pimreghdr));
1557         if (!ipv4_is_multicast(encap->daddr) ||
1558             encap->tot_len == 0 ||
1559             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1560                 goto drop;
1561
1562         read_lock(&mrt_lock);
1563         if (reg_vif_num >= 0)
1564                 reg_dev = vif_table[reg_vif_num].dev;
1565         if (reg_dev)
1566                 dev_hold(reg_dev);
1567         read_unlock(&mrt_lock);
1568
1569         if (reg_dev == NULL)
1570                 goto drop;
1571
1572         skb->mac_header = skb->network_header;
1573         skb_pull(skb, (u8*)encap - skb->data);
1574         skb_reset_network_header(skb);
1575         skb->dev = reg_dev;
1576         skb->protocol = htons(ETH_P_IP);
1577         skb->ip_summed = 0;
1578         skb->pkt_type = PACKET_HOST;
1579         dst_release(skb->dst);
1580         reg_dev->stats.rx_bytes += skb->len;
1581         reg_dev->stats.rx_packets++;
1582         skb->dst = NULL;
1583         nf_reset(skb);
1584         netif_rx(skb);
1585         dev_put(reg_dev);
1586         return 0;
1587  drop:
1588         kfree_skb(skb);
1589         return 0;
1590 }
1591 #endif
1592
1593 static int
1594 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1595 {
1596         int ct;
1597         struct rtnexthop *nhp;
1598         struct net_device *dev = vif_table[c->mfc_parent].dev;
1599         u8 *b = skb_tail_pointer(skb);
1600         struct rtattr *mp_head;
1601
1602         if (dev)
1603                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1604
1605         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1606
1607         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1608                 if (c->mfc_un.res.ttls[ct] < 255) {
1609                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1610                                 goto rtattr_failure;
1611                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1612                         nhp->rtnh_flags = 0;
1613                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1614                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1615                         nhp->rtnh_len = sizeof(*nhp);
1616                 }
1617         }
1618         mp_head->rta_type = RTA_MULTIPATH;
1619         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1620         rtm->rtm_type = RTN_MULTICAST;
1621         return 1;
1622
1623 rtattr_failure:
1624         nlmsg_trim(skb, b);
1625         return -EMSGSIZE;
1626 }
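/*
 * A minimal userspace sketch (not part of this file) of how a netlink
 * consumer might walk the RTA_MULTIPATH attribute built above; `mp' is
 * assumed to be the RTA_MULTIPATH rtattr taken from an RTM_GETROUTE
 * reply:
 *
 *	struct rtnexthop *nhp = RTA_DATA(mp);
 *	int len = RTA_PAYLOAD(mp);
 *
 *	while (RTNH_OK(nhp, len)) {
 *		printf("oif %d ttl %d\n",
 *		       nhp->rtnh_ifindex, nhp->rtnh_hops);
 *		len -= RTNH_ALIGN(nhp->rtnh_len);
 *		nhp = RTNH_NEXT(nhp);
 *	}
 *
 * rtnh_ifindex is the output device and rtnh_hops carries the vif's
 * TTL threshold, mirroring what ipmr_fill_mroute() stored.
 */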
1627
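/*
 * Reached from the rtnetlink query path (rt_fill_info()) when a route
 * being reported is multicast.  If no cache entry exists yet, a copy
 * of the skb is queued as unresolved so the answer can be generated
 * once mrouted resolves the route; with `nowait' set, -EAGAIN is
 * returned instead of queueing.
 */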
1628 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1629 {
1630         int err;
1631         struct mfc_cache *cache;
1632         struct rtable *rt = skb->rtable;
1633
1634         read_lock(&mrt_lock);
1635         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1636
1637         if (cache == NULL) {
1638                 struct sk_buff *skb2;
1639                 struct iphdr *iph;
1640                 struct net_device *dev;
1641                 int vif;
1642
1643                 if (nowait) {
1644                         read_unlock(&mrt_lock);
1645                         return -EAGAIN;
1646                 }
1647
1648                 dev = skb->dev;
1649                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1650                         read_unlock(&mrt_lock);
1651                         return -ENODEV;
1652                 }
1653                 skb2 = skb_clone(skb, GFP_ATOMIC);
1654                 if (!skb2) {
1655                         read_unlock(&mrt_lock);
1656                         return -ENOMEM;
1657                 }
1658
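                /* Fabricate an IP header for the queued copy: version 0
                 * never appears on the wire, which is how
                 * ipmr_cache_resolve() later tells this netlink query
                 * apart from real queued packets. */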
1659                 skb_push(skb2, sizeof(struct iphdr));
1660                 skb_reset_network_header(skb2);
1661                 iph = ip_hdr(skb2);
1662                 iph->ihl = sizeof(struct iphdr) >> 2;
1663                 iph->saddr = rt->rt_src;
1664                 iph->daddr = rt->rt_dst;
1665                 iph->version = 0;
1666                 err = ipmr_cache_unresolved(vif, skb2);
1667                 read_unlock(&mrt_lock);
1668                 return err;
1669         }
1670
1671         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1672                 cache->mfc_flags |= MFC_NOTIFY;
1673         err = ipmr_fill_mroute(skb, cache, rtm);
1674         read_unlock(&mrt_lock);
1675         return err;
1676 }
1677
1678 #ifdef CONFIG_PROC_FS
1679 /*
1680  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1681  */
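/*
 * A minimal userspace sketch (not part of this file) of reading the
 * vif table; the columns follow the header emitted by
 * ipmr_vif_seq_show() below:
 *
 *	FILE *f = fopen("/proc/net/ip_mr_vif", "r");
 *	char line[256];
 *
 *	if (f != NULL) {
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *	}
 */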
1682 struct ipmr_vif_iter {
1683         int ct;
1684 };
1685
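/* Advance to the pos'th live entry in vif_table, skipping the holes
 * left behind by deleted vifs. */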
1686 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1687                                            loff_t pos)
1688 {
1689         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1690                 if (!VIF_EXISTS(iter->ct))
1691                         continue;
1692                 if (pos-- == 0)
1693                         return &vif_table[iter->ct];
1694         }
1695         return NULL;
1696 }
1697
1698 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1699         __acquires(mrt_lock)
1700 {
1701         read_lock(&mrt_lock);
1702         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1703                 : SEQ_START_TOKEN;
1704 }
1705
1706 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1707 {
1708         struct ipmr_vif_iter *iter = seq->private;
1709
1710         ++*pos;
1711         if (v == SEQ_START_TOKEN)
1712                 return ipmr_vif_seq_idx(iter, 0);
1713
1714         while (++iter->ct < maxvif) {
1715                 if (!VIF_EXISTS(iter->ct))
1716                         continue;
1717                 return &vif_table[iter->ct];
1718         }
1719         return NULL;
1720 }
1721
1722 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1723         __releases(mrt_lock)
1724 {
1725         read_unlock(&mrt_lock);
1726 }
1727
1728 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1729 {
1730         if (v == SEQ_START_TOKEN) {
1731                 seq_puts(seq,
1732                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1733         } else {
1734                 const struct vif_device *vif = v;
1735                 const char *name = vif->dev ? vif->dev->name : "none";
1736
1737                 seq_printf(seq,
1738                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1739                            vif - vif_table,
1740                            name, vif->bytes_in, vif->pkt_in,
1741                            vif->bytes_out, vif->pkt_out,
1742                            vif->flags, vif->local, vif->remote);
1743         }
1744         return 0;
1745 }
1746
1747 static const struct seq_operations ipmr_vif_seq_ops = {
1748         .start = ipmr_vif_seq_start,
1749         .next  = ipmr_vif_seq_next,
1750         .stop  = ipmr_vif_seq_stop,
1751         .show  = ipmr_vif_seq_show,
1752 };
1753
1754 static int ipmr_vif_open(struct inode *inode, struct file *file)
1755 {
1756         return seq_open_private(file, &ipmr_vif_seq_ops,
1757                         sizeof(struct ipmr_vif_iter));
1758 }
1759
1760 static const struct file_operations ipmr_vif_fops = {
1761         .owner   = THIS_MODULE,
1762         .open    = ipmr_vif_open,
1763         .read    = seq_read,
1764         .llseek  = seq_lseek,
1765         .release = seq_release_private,
1766 };
1767
1768 struct ipmr_mfc_iter {
1769         struct mfc_cache **cache;
1770         int ct;
1771 };
1772
1773
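/*
 * The MFC dump walks two structures guarded by two different locks:
 * the resolved entries in mfc_cache_array under mrt_lock, then the
 * unresolved queue under mfc_unres_lock.  it->cache records which one
 * we are in, so that ->next can hand over from one lock to the other
 * and ->stop can release whichever lock is still held.
 */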
1774 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1775 {
1776         struct mfc_cache *mfc;
1777
1778         it->cache = mfc_cache_array;
1779         read_lock(&mrt_lock);
1780         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1781                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1782                         if (pos-- == 0)
1783                                 return mfc;
1784         read_unlock(&mrt_lock);
1785
1786         it->cache = &mfc_unres_queue;
1787         spin_lock_bh(&mfc_unres_lock);
1788         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1789                 if (pos-- == 0)
1790                         return mfc;
1791         spin_unlock_bh(&mfc_unres_lock);
1792
1793         it->cache = NULL;
1794         return NULL;
1795 }
1796
1797
1798 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1799 {
1800         struct ipmr_mfc_iter *it = seq->private;
1801         it->cache = NULL;
1802         it->ct = 0;
1803         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1804                 : SEQ_START_TOKEN;
1805 }
1806
1807 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1808 {
1809         struct mfc_cache *mfc = v;
1810         struct ipmr_mfc_iter *it = seq->private;
1811
1812         ++*pos;
1813
1814         if (v == SEQ_START_TOKEN)
1815                 return ipmr_mfc_seq_idx(seq->private, 0);
1816
1817         if (mfc->next)
1818                 return mfc->next;
1819
1820         if (it->cache == &mfc_unres_queue)
1821                 goto end_of_list;
1822
1823         BUG_ON(it->cache != mfc_cache_array);
1824
1825         while (++it->ct < MFC_LINES) {
1826                 mfc = mfc_cache_array[it->ct];
1827                 if (mfc)
1828                         return mfc;
1829         }
1830
1831         /* exhausted cache_array, show unresolved */
1832         read_unlock(&mrt_lock);
1833         it->cache = &mfc_unres_queue;
1834         it->ct = 0;
1835
1836         spin_lock_bh(&mfc_unres_lock);
1837         mfc = mfc_unres_queue;
1838         if (mfc)
1839                 return mfc;
1840
1841  end_of_list:
1842         spin_unlock_bh(&mfc_unres_lock);
1843         it->cache = NULL;
1844
1845         return NULL;
1846 }
1847
1848 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1849 {
1850         struct ipmr_mfc_iter *it = seq->private;
1851
1852         if (it->cache == &mfc_unres_queue)
1853                 spin_unlock_bh(&mfc_unres_lock);
1854         else if (it->cache == mfc_cache_array)
1855                 read_unlock(&mrt_lock);
1856 }
1857
1858 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1859 {
1860         int n;
1861
1862         if (v == SEQ_START_TOKEN) {
1863                 seq_puts(seq,
1864                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1865         } else {
1866                 const struct mfc_cache *mfc = v;
1867                 const struct ipmr_mfc_iter *it = seq->private;
1868
1869                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1870                            (unsigned long) mfc->mfc_mcastgrp,
1871                            (unsigned long) mfc->mfc_origin,
1872                            mfc->mfc_parent,
1873                            mfc->mfc_un.res.pkt,
1874                            mfc->mfc_un.res.bytes,
1875                            mfc->mfc_un.res.wrong_if);
1876
1877                 if (it->cache != &mfc_unres_queue) {
1878                         for (n = mfc->mfc_un.res.minvif;
1879                              n < mfc->mfc_un.res.maxvif; n++) {
1880                                 if (VIF_EXISTS(n) &&
1881                                     mfc->mfc_un.res.ttls[n] < 255)
1882                                         seq_printf(seq,
1883                                                    " %2d:%-3d",
1884                                                    n, mfc->mfc_un.res.ttls[n]);
1885                         }
1886                 }
1887                 seq_putc(seq, '\n');
1888         }
1889         return 0;
1890 }
1891
1892 static const struct seq_operations ipmr_mfc_seq_ops = {
1893         .start = ipmr_mfc_seq_start,
1894         .next  = ipmr_mfc_seq_next,
1895         .stop  = ipmr_mfc_seq_stop,
1896         .show  = ipmr_mfc_seq_show,
1897 };
1898
1899 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1900 {
1901         return seq_open_private(file, &ipmr_mfc_seq_ops,
1902                         sizeof(struct ipmr_mfc_iter));
1903 }
1904
1905 static const struct file_operations ipmr_mfc_fops = {
1906         .owner   = THIS_MODULE,
1907         .open    = ipmr_mfc_open,
1908         .read    = seq_read,
1909         .llseek  = seq_lseek,
1910         .release = seq_release_private,
1911 };
1912 #endif
1913
1914 #ifdef CONFIG_IP_PIMSM_V2
1915 static struct net_protocol pim_protocol = {
1916         .handler        =       pim_rcv,
1917 };
1918 #endif
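/*
 * Note: pim_protocol is not registered here; elsewhere in this file the
 * MRT_PIM setsockopt path is expected to call
 * inet_add_protocol(&pim_protocol, IPPROTO_PIM) when the routing daemon
 * switches PIM on, and inet_del_protocol() when it is switched off.
 */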
1919
1920
1921 /*
1922  *      Setup for IP multicast routing: cache allocator, expiry timer, netdevice notifier and /proc entries
1923  */
1924
1925 int __init ip_mr_init(void)
1926 {
1927         int err;
1928
1929         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1930                                        sizeof(struct mfc_cache),
1931                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1932                                        NULL);
1933         if (!mrt_cachep)
1934                 return -ENOMEM;
1935
1936         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1937         err = register_netdevice_notifier(&ip_mr_notifier);
1938         if (err)
1939                 goto reg_notif_fail;
1940 #ifdef CONFIG_PROC_FS
1941         err = -ENOMEM;
1942         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1943                 goto proc_vif_fail;
1944         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1945                 goto proc_cache_fail;
1946 #endif
1947         return 0;
1948 /* Unwind in the reverse order of the setup above. */
1949 #ifdef CONFIG_PROC_FS
1950 proc_cache_fail:
1951         proc_net_remove(&init_net, "ip_mr_vif");
1952 proc_vif_fail:
1953         unregister_netdevice_notifier(&ip_mr_notifier);
1954 #endif
1955 reg_notif_fail:
1956         del_timer(&ipmr_expire_timer);
1957         kmem_cache_destroy(mrt_cachep);
1958         return err;
1959 }
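/*
 * ip_mr_init() is expected to be called once from the IPv4 stack
 * initialisation (inet_init() in af_inet.c) when CONFIG_IP_MROUTE is
 * enabled; a failure here leaves the kernel without multicast routing
 * but otherwise functional.
 */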