Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[linux-2.6] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
/* CONFIG_IP_PIMSM is defined whenever either PIM-SM version is configured. */
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

/* Socket registered through MRT_INIT; cache reports are queued to it.
   NULL while no multicast routing socket is registered.
 */
static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
/* One more than the highest vif_table index currently in use (0 if none). */
static int maxvif;

/* True when vif_table slot idx has a device attached. */
#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected
   by the weak lock mrt_lock. The queue of unresolved entries is protected
   by the strong spinlock mfc_unres_lock.

   In this case the data path is free of exclusive locks altogether.
 */

/* Slab cache from which all struct mfc_cache entries are allocated. */
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

/* Timer armed by ipmr_cache_unresolved() and re-armed by ipmr_expire_process(). */
static struct timer_list ipmr_expire_timer;
118
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
120
121 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
122 {
123         dev_close(dev);
124
125         dev = __dev_get_by_name(&init_net, "tunl0");
126         if (dev) {
127                 const struct net_device_ops *ops = dev->netdev_ops;
128                 struct ifreq ifr;
129                 struct ip_tunnel_parm p;
130
131                 memset(&p, 0, sizeof(p));
132                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
133                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
134                 p.iph.version = 4;
135                 p.iph.ihl = 5;
136                 p.iph.protocol = IPPROTO_IPIP;
137                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139
140                 if (ops->ndo_do_ioctl) {
141                         mm_segment_t oldfs = get_fs();
142
143                         set_fs(KERNEL_DS);
144                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
145                         set_fs(oldfs);
146                 }
147         }
148 }
149
150 static
151 struct net_device *ipmr_new_tunnel(struct vifctl *v)
152 {
153         struct net_device  *dev;
154
155         dev = __dev_get_by_name(&init_net, "tunl0");
156
157         if (dev) {
158                 const struct net_device_ops *ops = dev->netdev_ops;
159                 int err;
160                 struct ifreq ifr;
161                 struct ip_tunnel_parm p;
162                 struct in_device  *in_dev;
163
164                 memset(&p, 0, sizeof(p));
165                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
166                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
167                 p.iph.version = 4;
168                 p.iph.ihl = 5;
169                 p.iph.protocol = IPPROTO_IPIP;
170                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
171                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
172
173                 if (ops->ndo_do_ioctl) {
174                         mm_segment_t oldfs = get_fs();
175
176                         set_fs(KERNEL_DS);
177                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
178                         set_fs(oldfs);
179                 } else
180                         err = -EOPNOTSUPP;
181
182                 dev = NULL;
183
184                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
185                         dev->flags |= IFF_MULTICAST;
186
187                         in_dev = __in_dev_get_rtnl(dev);
188                         if (in_dev == NULL)
189                                 goto failure;
190
191                         ipv4_devconf_setall(in_dev);
192                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
193
194                         if (dev_open(dev))
195                                 goto failure;
196                         dev_hold(dev);
197                 }
198         }
199         return dev;
200
201 failure:
202         /* allow the register to be completed before unregistering. */
203         rtnl_unlock();
204         rtnl_lock();
205
206         unregister_netdevice(dev);
207         return NULL;
208 }
209
210 #ifdef CONFIG_IP_PIMSM
211
/* vif_table index of the PIM register vif, or -1 while none is installed
   (set by vif_add(), reset by vif_delete()). */
static int reg_vif_num = -1;
213
214 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
215 {
216         read_lock(&mrt_lock);
217         dev->stats.tx_bytes += skb->len;
218         dev->stats.tx_packets++;
219         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
220         read_unlock(&mrt_lock);
221         kfree_skb(skb);
222         return 0;
223 }
224
/* Device operations of the PIM register device; only the transmit hook is set. */
static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
};
228
229 static void reg_vif_setup(struct net_device *dev)
230 {
231         dev->type               = ARPHRD_PIMREG;
232         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
233         dev->flags              = IFF_NOARP;
234         dev->netdev_ops         = &reg_vif_netdev_ops,
235         dev->destructor         = free_netdev;
236 }
237
238 static struct net_device *ipmr_reg_vif(void)
239 {
240         struct net_device *dev;
241         struct in_device *in_dev;
242
243         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
244
245         if (dev == NULL)
246                 return NULL;
247
248         if (register_netdevice(dev)) {
249                 free_netdev(dev);
250                 return NULL;
251         }
252         dev->iflink = 0;
253
254         rcu_read_lock();
255         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
256                 rcu_read_unlock();
257                 goto failure;
258         }
259
260         ipv4_devconf_setall(in_dev);
261         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
262         rcu_read_unlock();
263
264         if (dev_open(dev))
265                 goto failure;
266
267         dev_hold(dev);
268
269         return dev;
270
271 failure:
272         /* allow the register to be completed before unregistering. */
273         rtnl_unlock();
274         rtnl_lock();
275
276         unregister_netdevice(dev);
277         return NULL;
278 }
279 #endif
280
281 /*
282  *      Delete a VIF entry
283  *      @notify: Set to 1, if the caller is a notifier_call
284  */
285
286 static int vif_delete(int vifi, int notify)
287 {
288         struct vif_device *v;
289         struct net_device *dev;
290         struct in_device *in_dev;
291
292         if (vifi < 0 || vifi >= maxvif)
293                 return -EADDRNOTAVAIL;
294
295         v = &vif_table[vifi];
296
297         write_lock_bh(&mrt_lock);
298         dev = v->dev;
299         v->dev = NULL;
300
301         if (!dev) {
302                 write_unlock_bh(&mrt_lock);
303                 return -EADDRNOTAVAIL;
304         }
305
306 #ifdef CONFIG_IP_PIMSM
307         if (vifi == reg_vif_num)
308                 reg_vif_num = -1;
309 #endif
310
311         if (vifi+1 == maxvif) {
312                 int tmp;
313                 for (tmp=vifi-1; tmp>=0; tmp--) {
314                         if (VIF_EXISTS(tmp))
315                                 break;
316                 }
317                 maxvif = tmp+1;
318         }
319
320         write_unlock_bh(&mrt_lock);
321
322         dev_set_allmulti(dev, -1);
323
324         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
325                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
326                 ip_rt_multicast_event(in_dev);
327         }
328
329         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
330                 unregister_netdevice(dev);
331
332         dev_put(dev);
333         return 0;
334 }
335
336 /* Destroy an unresolved cache entry, killing queued skbs
337    and reporting error to netlink readers.
338  */
339
340 static void ipmr_destroy_unres(struct mfc_cache *c)
341 {
342         struct sk_buff *skb;
343         struct nlmsgerr *e;
344
345         atomic_dec(&cache_resolve_queue_len);
346
347         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
348                 if (ip_hdr(skb)->version == 0) {
349                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
350                         nlh->nlmsg_type = NLMSG_ERROR;
351                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
352                         skb_trim(skb, nlh->nlmsg_len);
353                         e = NLMSG_DATA(nlh);
354                         e->error = -ETIMEDOUT;
355                         memset(&e->msg, 0, sizeof(e->msg));
356
357                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
358                 } else
359                         kfree_skb(skb);
360         }
361
362         kmem_cache_free(mrt_cachep, c);
363 }
364
365
366 /* Single timer process for all the unresolved queue. */
367
368 static void ipmr_expire_process(unsigned long dummy)
369 {
370         unsigned long now;
371         unsigned long expires;
372         struct mfc_cache *c, **cp;
373
374         if (!spin_trylock(&mfc_unres_lock)) {
375                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
376                 return;
377         }
378
379         if (atomic_read(&cache_resolve_queue_len) == 0)
380                 goto out;
381
382         now = jiffies;
383         expires = 10*HZ;
384         cp = &mfc_unres_queue;
385
386         while ((c=*cp) != NULL) {
387                 if (time_after(c->mfc_un.unres.expires, now)) {
388                         unsigned long interval = c->mfc_un.unres.expires - now;
389                         if (interval < expires)
390                                 expires = interval;
391                         cp = &c->next;
392                         continue;
393                 }
394
395                 *cp = c->next;
396
397                 ipmr_destroy_unres(c);
398         }
399
400         if (atomic_read(&cache_resolve_queue_len))
401                 mod_timer(&ipmr_expire_timer, jiffies + expires);
402
403 out:
404         spin_unlock(&mfc_unres_lock);
405 }
406
407 /* Fill oifs list. It is called under write locked mrt_lock. */
408
409 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
410 {
411         int vifi;
412
413         cache->mfc_un.res.minvif = MAXVIFS;
414         cache->mfc_un.res.maxvif = 0;
415         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
416
417         for (vifi=0; vifi<maxvif; vifi++) {
418                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
419                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
420                         if (cache->mfc_un.res.minvif > vifi)
421                                 cache->mfc_un.res.minvif = vifi;
422                         if (cache->mfc_un.res.maxvif <= vifi)
423                                 cache->mfc_un.res.maxvif = vifi + 1;
424                 }
425         }
426 }
427
428 static int vif_add(struct vifctl *vifc, int mrtsock)
429 {
430         int vifi = vifc->vifc_vifi;
431         struct vif_device *v = &vif_table[vifi];
432         struct net_device *dev;
433         struct in_device *in_dev;
434         int err;
435
436         /* Is vif busy ? */
437         if (VIF_EXISTS(vifi))
438                 return -EADDRINUSE;
439
440         switch (vifc->vifc_flags) {
441 #ifdef CONFIG_IP_PIMSM
442         case VIFF_REGISTER:
443                 /*
444                  * Special Purpose VIF in PIM
445                  * All the packets will be sent to the daemon
446                  */
447                 if (reg_vif_num >= 0)
448                         return -EADDRINUSE;
449                 dev = ipmr_reg_vif();
450                 if (!dev)
451                         return -ENOBUFS;
452                 err = dev_set_allmulti(dev, 1);
453                 if (err) {
454                         unregister_netdevice(dev);
455                         dev_put(dev);
456                         return err;
457                 }
458                 break;
459 #endif
460         case VIFF_TUNNEL:
461                 dev = ipmr_new_tunnel(vifc);
462                 if (!dev)
463                         return -ENOBUFS;
464                 err = dev_set_allmulti(dev, 1);
465                 if (err) {
466                         ipmr_del_tunnel(dev, vifc);
467                         dev_put(dev);
468                         return err;
469                 }
470                 break;
471         case 0:
472                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
473                 if (!dev)
474                         return -EADDRNOTAVAIL;
475                 err = dev_set_allmulti(dev, 1);
476                 if (err) {
477                         dev_put(dev);
478                         return err;
479                 }
480                 break;
481         default:
482                 return -EINVAL;
483         }
484
485         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
486                 return -EADDRNOTAVAIL;
487         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
488         ip_rt_multicast_event(in_dev);
489
490         /*
491          *      Fill in the VIF structures
492          */
493         v->rate_limit = vifc->vifc_rate_limit;
494         v->local = vifc->vifc_lcl_addr.s_addr;
495         v->remote = vifc->vifc_rmt_addr.s_addr;
496         v->flags = vifc->vifc_flags;
497         if (!mrtsock)
498                 v->flags |= VIFF_STATIC;
499         v->threshold = vifc->vifc_threshold;
500         v->bytes_in = 0;
501         v->bytes_out = 0;
502         v->pkt_in = 0;
503         v->pkt_out = 0;
504         v->link = dev->ifindex;
505         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
506                 v->link = dev->iflink;
507
508         /* And finish update writing critical data */
509         write_lock_bh(&mrt_lock);
510         v->dev = dev;
511 #ifdef CONFIG_IP_PIMSM
512         if (v->flags&VIFF_REGISTER)
513                 reg_vif_num = vifi;
514 #endif
515         if (vifi+1 > maxvif)
516                 maxvif = vifi+1;
517         write_unlock_bh(&mrt_lock);
518         return 0;
519 }
520
521 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
522 {
523         int line = MFC_HASH(mcastgrp, origin);
524         struct mfc_cache *c;
525
526         for (c=mfc_cache_array[line]; c; c = c->next) {
527                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
528                         break;
529         }
530         return c;
531 }
532
533 /*
534  *      Allocate a multicast cache entry
535  */
536 static struct mfc_cache *ipmr_cache_alloc(void)
537 {
538         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
539         if (c == NULL)
540                 return NULL;
541         c->mfc_un.res.minvif = MAXVIFS;
542         return c;
543 }
544
545 static struct mfc_cache *ipmr_cache_alloc_unres(void)
546 {
547         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
548         if (c == NULL)
549                 return NULL;
550         skb_queue_head_init(&c->mfc_un.unres.unresolved);
551         c->mfc_un.unres.expires = jiffies + 10*HZ;
552         return c;
553 }
554
555 /*
556  *      A cache entry has gone into a resolved state from queued
557  */
558
559 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
560 {
561         struct sk_buff *skb;
562         struct nlmsgerr *e;
563
564         /*
565          *      Play the pending entries through our router
566          */
567
568         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
569                 if (ip_hdr(skb)->version == 0) {
570                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
571
572                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
573                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
574                                                   (u8 *)nlh);
575                         } else {
576                                 nlh->nlmsg_type = NLMSG_ERROR;
577                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
578                                 skb_trim(skb, nlh->nlmsg_len);
579                                 e = NLMSG_DATA(nlh);
580                                 e->error = -EMSGSIZE;
581                                 memset(&e->msg, 0, sizeof(e->msg));
582                         }
583
584                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
585                 } else
586                         ip_mr_forward(skb, c, 0);
587         }
588 }
589
/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Builds a new skb describing @pkt and queues it on mroute_socket.
 *      @pkt itself is not freed here.
 *
 *      @pkt:    packet that triggered the report
 *      @vifi:   vif index stored in the message (the IGMPMSG_WHOLEPKT
 *               path stores reg_vif_num instead)
 *      @assert: message type (e.g. IGMPMSG_NOCACHE, IGMPMSG_WHOLEPKT)
 *
 *      Returns -ENOBUFS if no skb could be allocated, -EINVAL if there is
 *      no mroute socket, otherwise the result of sock_queue_rcv_skb().
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

        /* WHOLEPKT reports carry the entire packet, so copy it with room
           for one extra IP header; other reports use a small fresh skb. */
#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
        /* Chained assignment: both fields receive the message type. */
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        /* Nobody to deliver to: discard the report that was just built. */
        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret = sock_queue_rcv_skb(mroute_socket, skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}
677
678 /*
679  *      Queue a packet for resolution. It gets locked cache entry!
680  */
681
682 static int
683 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
684 {
685         int err;
686         struct mfc_cache *c;
687         const struct iphdr *iph = ip_hdr(skb);
688
689         spin_lock_bh(&mfc_unres_lock);
690         for (c=mfc_unres_queue; c; c=c->next) {
691                 if (c->mfc_mcastgrp == iph->daddr &&
692                     c->mfc_origin == iph->saddr)
693                         break;
694         }
695
696         if (c == NULL) {
697                 /*
698                  *      Create a new entry if allowable
699                  */
700
701                 if (atomic_read(&cache_resolve_queue_len) >= 10 ||
702                     (c=ipmr_cache_alloc_unres())==NULL) {
703                         spin_unlock_bh(&mfc_unres_lock);
704
705                         kfree_skb(skb);
706                         return -ENOBUFS;
707                 }
708
709                 /*
710                  *      Fill in the new cache entry
711                  */
712                 c->mfc_parent   = -1;
713                 c->mfc_origin   = iph->saddr;
714                 c->mfc_mcastgrp = iph->daddr;
715
716                 /*
717                  *      Reflect first query at mrouted.
718                  */
719                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
720                         /* If the report failed throw the cache entry
721                            out - Brad Parker
722                          */
723                         spin_unlock_bh(&mfc_unres_lock);
724
725                         kmem_cache_free(mrt_cachep, c);
726                         kfree_skb(skb);
727                         return err;
728                 }
729
730                 atomic_inc(&cache_resolve_queue_len);
731                 c->next = mfc_unres_queue;
732                 mfc_unres_queue = c;
733
734                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
735         }
736
737         /*
738          *      See if we can append the packet
739          */
740         if (c->mfc_un.unres.unresolved.qlen>3) {
741                 kfree_skb(skb);
742                 err = -ENOBUFS;
743         } else {
744                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
745                 err = 0;
746         }
747
748         spin_unlock_bh(&mfc_unres_lock);
749         return err;
750 }
751
752 /*
753  *      MFC cache manipulation by user space mroute daemon
754  */
755
756 static int ipmr_mfc_delete(struct mfcctl *mfc)
757 {
758         int line;
759         struct mfc_cache *c, **cp;
760
761         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
762
763         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
764                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
765                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
766                         write_lock_bh(&mrt_lock);
767                         *cp = c->next;
768                         write_unlock_bh(&mrt_lock);
769
770                         kmem_cache_free(mrt_cachep, c);
771                         return 0;
772                 }
773         }
774         return -ENOENT;
775 }
776
777 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
778 {
779         int line;
780         struct mfc_cache *uc, *c, **cp;
781
782         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
783
784         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
785                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
786                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
787                         break;
788         }
789
790         if (c != NULL) {
791                 write_lock_bh(&mrt_lock);
792                 c->mfc_parent = mfc->mfcc_parent;
793                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
794                 if (!mrtsock)
795                         c->mfc_flags |= MFC_STATIC;
796                 write_unlock_bh(&mrt_lock);
797                 return 0;
798         }
799
800         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
801                 return -EINVAL;
802
803         c = ipmr_cache_alloc();
804         if (c == NULL)
805                 return -ENOMEM;
806
807         c->mfc_origin = mfc->mfcc_origin.s_addr;
808         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
809         c->mfc_parent = mfc->mfcc_parent;
810         ipmr_update_thresholds(c, mfc->mfcc_ttls);
811         if (!mrtsock)
812                 c->mfc_flags |= MFC_STATIC;
813
814         write_lock_bh(&mrt_lock);
815         c->next = mfc_cache_array[line];
816         mfc_cache_array[line] = c;
817         write_unlock_bh(&mrt_lock);
818
819         /*
820          *      Check to see if we resolved a queued list. If so we
821          *      need to send on the frames and tidy up.
822          */
823         spin_lock_bh(&mfc_unres_lock);
824         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
825              cp = &uc->next) {
826                 if (uc->mfc_origin == c->mfc_origin &&
827                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
828                         *cp = uc->next;
829                         if (atomic_dec_and_test(&cache_resolve_queue_len))
830                                 del_timer(&ipmr_expire_timer);
831                         break;
832                 }
833         }
834         spin_unlock_bh(&mfc_unres_lock);
835
836         if (uc) {
837                 ipmr_cache_resolve(uc, c);
838                 kmem_cache_free(mrt_cachep, uc);
839         }
840         return 0;
841 }
842
843 /*
844  *      Close the multicast socket, and clear the vif tables etc
845  */
846
847 static void mroute_clean_tables(struct sock *sk)
848 {
849         int i;
850
851         /*
852          *      Shut down all active vif entries
853          */
854         for (i=0; i<maxvif; i++) {
855                 if (!(vif_table[i].flags&VIFF_STATIC))
856                         vif_delete(i, 0);
857         }
858
859         /*
860          *      Wipe the cache
861          */
862         for (i=0; i<MFC_LINES; i++) {
863                 struct mfc_cache *c, **cp;
864
865                 cp = &mfc_cache_array[i];
866                 while ((c = *cp) != NULL) {
867                         if (c->mfc_flags&MFC_STATIC) {
868                                 cp = &c->next;
869                                 continue;
870                         }
871                         write_lock_bh(&mrt_lock);
872                         *cp = c->next;
873                         write_unlock_bh(&mrt_lock);
874
875                         kmem_cache_free(mrt_cachep, c);
876                 }
877         }
878
879         if (atomic_read(&cache_resolve_queue_len) != 0) {
880                 struct mfc_cache *c;
881
882                 spin_lock_bh(&mfc_unres_lock);
883                 while (mfc_unres_queue != NULL) {
884                         c = mfc_unres_queue;
885                         mfc_unres_queue = c->next;
886                         spin_unlock_bh(&mfc_unres_lock);
887
888                         ipmr_destroy_unres(c);
889
890                         spin_lock_bh(&mfc_unres_lock);
891                 }
892                 spin_unlock_bh(&mfc_unres_lock);
893         }
894 }
895
896 static void mrtsock_destruct(struct sock *sk)
897 {
898         rtnl_lock();
899         if (sk == mroute_socket) {
900                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
901
902                 write_lock_bh(&mrt_lock);
903                 mroute_socket = NULL;
904                 write_unlock_bh(&mrt_lock);
905
906                 mroute_clean_tables(sk);
907         }
908         rtnl_unlock();
909 }
910
911 /*
912  *      Socket options and virtual interface manipulation. The whole
913  *      virtual interface system is a complete heap, but unfortunately
914  *      that's how BSD mrouted happens to think. Maybe one day with a proper
915  *      MOSPF/PIM router set up we can clean this up.
916  */
917
918 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
919 {
920         int ret;
921         struct vifctl vif;
922         struct mfcctl mfc;
923
924         if (optname != MRT_INIT) {
925                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
926                         return -EACCES;
927         }
928
929         switch (optname) {
930         case MRT_INIT:
931                 if (sk->sk_type != SOCK_RAW ||
932                     inet_sk(sk)->num != IPPROTO_IGMP)
933                         return -EOPNOTSUPP;
934                 if (optlen != sizeof(int))
935                         return -ENOPROTOOPT;
936
937                 rtnl_lock();
938                 if (mroute_socket) {
939                         rtnl_unlock();
940                         return -EADDRINUSE;
941                 }
942
943                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
944                 if (ret == 0) {
945                         write_lock_bh(&mrt_lock);
946                         mroute_socket = sk;
947                         write_unlock_bh(&mrt_lock);
948
949                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
950                 }
951                 rtnl_unlock();
952                 return ret;
953         case MRT_DONE:
954                 if (sk != mroute_socket)
955                         return -EACCES;
956                 return ip_ra_control(sk, 0, NULL);
957         case MRT_ADD_VIF:
958         case MRT_DEL_VIF:
959                 if (optlen != sizeof(vif))
960                         return -EINVAL;
961                 if (copy_from_user(&vif, optval, sizeof(vif)))
962                         return -EFAULT;
963                 if (vif.vifc_vifi >= MAXVIFS)
964                         return -ENFILE;
965                 rtnl_lock();
966                 if (optname == MRT_ADD_VIF) {
967                         ret = vif_add(&vif, sk==mroute_socket);
968                 } else {
969                         ret = vif_delete(vif.vifc_vifi, 0);
970                 }
971                 rtnl_unlock();
972                 return ret;
973
974                 /*
975                  *      Manipulate the forwarding caches. These live
976                  *      in a sort of kernel/user symbiosis.
977                  */
978         case MRT_ADD_MFC:
979         case MRT_DEL_MFC:
980                 if (optlen != sizeof(mfc))
981                         return -EINVAL;
982                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
983                         return -EFAULT;
984                 rtnl_lock();
985                 if (optname == MRT_DEL_MFC)
986                         ret = ipmr_mfc_delete(&mfc);
987                 else
988                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
989                 rtnl_unlock();
990                 return ret;
991                 /*
992                  *      Control PIM assert.
993                  */
994         case MRT_ASSERT:
995         {
996                 int v;
997                 if (get_user(v,(int __user *)optval))
998                         return -EFAULT;
999                 mroute_do_assert=(v)?1:0;
1000                 return 0;
1001         }
1002 #ifdef CONFIG_IP_PIMSM
1003         case MRT_PIM:
1004         {
1005                 int v;
1006
1007                 if (get_user(v,(int __user *)optval))
1008                         return -EFAULT;
1009                 v = (v) ? 1 : 0;
1010
1011                 rtnl_lock();
1012                 ret = 0;
1013                 if (v != mroute_do_pim) {
1014                         mroute_do_pim = v;
1015                         mroute_do_assert = v;
1016 #ifdef CONFIG_IP_PIMSM_V2
1017                         if (mroute_do_pim)
1018                                 ret = inet_add_protocol(&pim_protocol,
1019                                                         IPPROTO_PIM);
1020                         else
1021                                 ret = inet_del_protocol(&pim_protocol,
1022                                                         IPPROTO_PIM);
1023                         if (ret < 0)
1024                                 ret = -EAGAIN;
1025 #endif
1026                 }
1027                 rtnl_unlock();
1028                 return ret;
1029         }
1030 #endif
1031         /*
1032          *      Spurious command, or MRT_VERSION which you cannot
1033          *      set.
1034          */
1035         default:
1036                 return -ENOPROTOOPT;
1037         }
1038 }
1039
1040 /*
1041  *      Getsock opt support for the multicast routing system.
1042  */
1043
1044 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1045 {
1046         int olr;
1047         int val;
1048
1049         if (optname != MRT_VERSION &&
1050 #ifdef CONFIG_IP_PIMSM
1051            optname!=MRT_PIM &&
1052 #endif
1053            optname!=MRT_ASSERT)
1054                 return -ENOPROTOOPT;
1055
1056         if (get_user(olr, optlen))
1057                 return -EFAULT;
1058
1059         olr = min_t(unsigned int, olr, sizeof(int));
1060         if (olr < 0)
1061                 return -EINVAL;
1062
1063         if (put_user(olr, optlen))
1064                 return -EFAULT;
1065         if (optname == MRT_VERSION)
1066                 val = 0x0305;
1067 #ifdef CONFIG_IP_PIMSM
1068         else if (optname == MRT_PIM)
1069                 val = mroute_do_pim;
1070 #endif
1071         else
1072                 val = mroute_do_assert;
1073         if (copy_to_user(optval, &val, olr))
1074                 return -EFAULT;
1075         return 0;
1076 }
1077
1078 /*
1079  *      The IP multicast ioctl support routines.
1080  */
1081
1082 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1083 {
1084         struct sioc_sg_req sr;
1085         struct sioc_vif_req vr;
1086         struct vif_device *vif;
1087         struct mfc_cache *c;
1088
1089         switch (cmd) {
1090         case SIOCGETVIFCNT:
1091                 if (copy_from_user(&vr, arg, sizeof(vr)))
1092                         return -EFAULT;
1093                 if (vr.vifi >= maxvif)
1094                         return -EINVAL;
1095                 read_lock(&mrt_lock);
1096                 vif=&vif_table[vr.vifi];
1097                 if (VIF_EXISTS(vr.vifi))        {
1098                         vr.icount = vif->pkt_in;
1099                         vr.ocount = vif->pkt_out;
1100                         vr.ibytes = vif->bytes_in;
1101                         vr.obytes = vif->bytes_out;
1102                         read_unlock(&mrt_lock);
1103
1104                         if (copy_to_user(arg, &vr, sizeof(vr)))
1105                                 return -EFAULT;
1106                         return 0;
1107                 }
1108                 read_unlock(&mrt_lock);
1109                 return -EADDRNOTAVAIL;
1110         case SIOCGETSGCNT:
1111                 if (copy_from_user(&sr, arg, sizeof(sr)))
1112                         return -EFAULT;
1113
1114                 read_lock(&mrt_lock);
1115                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1116                 if (c) {
1117                         sr.pktcnt = c->mfc_un.res.pkt;
1118                         sr.bytecnt = c->mfc_un.res.bytes;
1119                         sr.wrong_if = c->mfc_un.res.wrong_if;
1120                         read_unlock(&mrt_lock);
1121
1122                         if (copy_to_user(arg, &sr, sizeof(sr)))
1123                                 return -EFAULT;
1124                         return 0;
1125                 }
1126                 read_unlock(&mrt_lock);
1127                 return -EADDRNOTAVAIL;
1128         default:
1129                 return -ENOIOCTLCMD;
1130         }
1131 }
1132
1133
1134 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1135 {
1136         struct net_device *dev = ptr;
1137         struct vif_device *v;
1138         int ct;
1139
1140         if (!net_eq(dev_net(dev), &init_net))
1141                 return NOTIFY_DONE;
1142
1143         if (event != NETDEV_UNREGISTER)
1144                 return NOTIFY_DONE;
1145         v=&vif_table[0];
1146         for (ct=0; ct<maxvif; ct++,v++) {
1147                 if (v->dev == dev)
1148                         vif_delete(ct, 1);
1149         }
1150         return NOTIFY_DONE;
1151 }
1152
1153
1154 static struct notifier_block ip_mr_notifier = {
1155         .notifier_call = ipmr_device_event,
1156 };
1157
1158 /*
1159  *      Encapsulate a packet by attaching a valid IPIP header to it.
1160  *      This avoids tunnel drivers and other mess and gives us the speed so
1161  *      important for multicast video.
1162  */
1163
1164 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1165 {
1166         struct iphdr *iph;
1167         struct iphdr *old_iph = ip_hdr(skb);
1168
1169         skb_push(skb, sizeof(struct iphdr));
1170         skb->transport_header = skb->network_header;
1171         skb_reset_network_header(skb);
1172         iph = ip_hdr(skb);
1173
1174         iph->version    =       4;
1175         iph->tos        =       old_iph->tos;
1176         iph->ttl        =       old_iph->ttl;
1177         iph->frag_off   =       0;
1178         iph->daddr      =       daddr;
1179         iph->saddr      =       saddr;
1180         iph->protocol   =       IPPROTO_IPIP;
1181         iph->ihl        =       5;
1182         iph->tot_len    =       htons(skb->len);
1183         ip_select_ident(iph, skb->dst, NULL);
1184         ip_send_check(iph);
1185
1186         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1187         nf_reset(skb);
1188 }
1189
1190 static inline int ipmr_forward_finish(struct sk_buff *skb)
1191 {
1192         struct ip_options * opt = &(IPCB(skb)->opt);
1193
1194         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1195
1196         if (unlikely(opt->optlen))
1197                 ip_forward_options(skb);
1198
1199         return dst_output(skb);
1200 }
1201
1202 /*
1203  *      Processing handlers for ipmr_forward
1204  */
1205
1206 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1207 {
1208         const struct iphdr *iph = ip_hdr(skb);
1209         struct vif_device *vif = &vif_table[vifi];
1210         struct net_device *dev;
1211         struct rtable *rt;
1212         int    encap = 0;
1213
1214         if (vif->dev == NULL)
1215                 goto out_free;
1216
1217 #ifdef CONFIG_IP_PIMSM
1218         if (vif->flags & VIFF_REGISTER) {
1219                 vif->pkt_out++;
1220                 vif->bytes_out += skb->len;
1221                 vif->dev->stats.tx_bytes += skb->len;
1222                 vif->dev->stats.tx_packets++;
1223                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1224                 kfree_skb(skb);
1225                 return;
1226         }
1227 #endif
1228
1229         if (vif->flags&VIFF_TUNNEL) {
1230                 struct flowi fl = { .oif = vif->link,
1231                                     .nl_u = { .ip4_u =
1232                                               { .daddr = vif->remote,
1233                                                 .saddr = vif->local,
1234                                                 .tos = RT_TOS(iph->tos) } },
1235                                     .proto = IPPROTO_IPIP };
1236                 if (ip_route_output_key(&init_net, &rt, &fl))
1237                         goto out_free;
1238                 encap = sizeof(struct iphdr);
1239         } else {
1240                 struct flowi fl = { .oif = vif->link,
1241                                     .nl_u = { .ip4_u =
1242                                               { .daddr = iph->daddr,
1243                                                 .tos = RT_TOS(iph->tos) } },
1244                                     .proto = IPPROTO_IPIP };
1245                 if (ip_route_output_key(&init_net, &rt, &fl))
1246                         goto out_free;
1247         }
1248
1249         dev = rt->u.dst.dev;
1250
1251         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1252                 /* Do not fragment multicasts. Alas, IPv4 does not
1253                    allow to send ICMP, so that packets will disappear
1254                    to blackhole.
1255                  */
1256
1257                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1258                 ip_rt_put(rt);
1259                 goto out_free;
1260         }
1261
1262         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1263
1264         if (skb_cow(skb, encap)) {
1265                 ip_rt_put(rt);
1266                 goto out_free;
1267         }
1268
1269         vif->pkt_out++;
1270         vif->bytes_out += skb->len;
1271
1272         dst_release(skb->dst);
1273         skb->dst = &rt->u.dst;
1274         ip_decrease_ttl(ip_hdr(skb));
1275
1276         /* FIXME: forward and output firewalls used to be called here.
1277          * What do we do with netfilter? -- RR */
1278         if (vif->flags & VIFF_TUNNEL) {
1279                 ip_encap(skb, vif->local, vif->remote);
1280                 /* FIXME: extra output firewall step used to be here. --RR */
1281                 vif->dev->stats.tx_packets++;
1282                 vif->dev->stats.tx_bytes += skb->len;
1283         }
1284
1285         IPCB(skb)->flags |= IPSKB_FORWARDED;
1286
1287         /*
1288          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1289          * not only before forwarding, but after forwarding on all output
1290          * interfaces. It is clear, if mrouter runs a multicasting
1291          * program, it should receive packets not depending to what interface
1292          * program is joined.
1293          * If we will not make it, the program will have to join on all
1294          * interfaces. On the other hand, multihoming host (or router, but
1295          * not mrouter) cannot join to more than one interface - it will
1296          * result in receiving multiple packets.
1297          */
1298         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1299                 ipmr_forward_finish);
1300         return;
1301
1302 out_free:
1303         kfree_skb(skb);
1304         return;
1305 }
1306
1307 static int ipmr_find_vif(struct net_device *dev)
1308 {
1309         int ct;
1310         for (ct=maxvif-1; ct>=0; ct--) {
1311                 if (vif_table[ct].dev == dev)
1312                         break;
1313         }
1314         return ct;
1315 }
1316
1317 /* "local" means that we should preserve one skb (for local delivery) */
1318
1319 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1320 {
1321         int psend = -1;
1322         int vif, ct;
1323
1324         vif = cache->mfc_parent;
1325         cache->mfc_un.res.pkt++;
1326         cache->mfc_un.res.bytes += skb->len;
1327
1328         /*
1329          * Wrong interface: drop packet and (maybe) send PIM assert.
1330          */
1331         if (vif_table[vif].dev != skb->dev) {
1332                 int true_vifi;
1333
1334                 if (skb->rtable->fl.iif == 0) {
1335                         /* It is our own packet, looped back.
1336                            Very complicated situation...
1337
1338                            The best workaround until routing daemons will be
1339                            fixed is not to redistribute packet, if it was
1340                            send through wrong interface. It means, that
1341                            multicast applications WILL NOT work for
1342                            (S,G), which have default multicast route pointing
1343                            to wrong oif. In any case, it is not a good
1344                            idea to use multicasting applications on router.
1345                          */
1346                         goto dont_forward;
1347                 }
1348
1349                 cache->mfc_un.res.wrong_if++;
1350                 true_vifi = ipmr_find_vif(skb->dev);
1351
1352                 if (true_vifi >= 0 && mroute_do_assert &&
1353                     /* pimsm uses asserts, when switching from RPT to SPT,
1354                        so that we cannot check that packet arrived on an oif.
1355                        It is bad, but otherwise we would need to move pretty
1356                        large chunk of pimd to kernel. Ough... --ANK
1357                      */
1358                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1359                     time_after(jiffies,
1360                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1361                         cache->mfc_un.res.last_assert = jiffies;
1362                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1363                 }
1364                 goto dont_forward;
1365         }
1366
1367         vif_table[vif].pkt_in++;
1368         vif_table[vif].bytes_in += skb->len;
1369
1370         /*
1371          *      Forward the frame
1372          */
1373         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1374                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1375                         if (psend != -1) {
1376                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1377                                 if (skb2)
1378                                         ipmr_queue_xmit(skb2, cache, psend);
1379                         }
1380                         psend = ct;
1381                 }
1382         }
1383         if (psend != -1) {
1384                 if (local) {
1385                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1386                         if (skb2)
1387                                 ipmr_queue_xmit(skb2, cache, psend);
1388                 } else {
1389                         ipmr_queue_xmit(skb, cache, psend);
1390                         return 0;
1391                 }
1392         }
1393
1394 dont_forward:
1395         if (!local)
1396                 kfree_skb(skb);
1397         return 0;
1398 }
1399
1400
1401 /*
1402  *      Multicast packets for forwarding arrive here
1403  */
1404
1405 int ip_mr_input(struct sk_buff *skb)
1406 {
1407         struct mfc_cache *cache;
1408         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1409
1410         /* Packet is looped back after forward, it should not be
1411            forwarded second time, but still can be delivered locally.
1412          */
1413         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1414                 goto dont_forward;
1415
1416         if (!local) {
1417                     if (IPCB(skb)->opt.router_alert) {
1418                             if (ip_call_ra_chain(skb))
1419                                     return 0;
1420                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1421                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1422                                Cisco IOS <= 11.2(8)) do not put router alert
1423                                option to IGMP packets destined to routable
1424                                groups. It is very bad, because it means
1425                                that we can forward NO IGMP messages.
1426                              */
1427                             read_lock(&mrt_lock);
1428                             if (mroute_socket) {
1429                                     nf_reset(skb);
1430                                     raw_rcv(mroute_socket, skb);
1431                                     read_unlock(&mrt_lock);
1432                                     return 0;
1433                             }
1434                             read_unlock(&mrt_lock);
1435                     }
1436         }
1437
1438         read_lock(&mrt_lock);
1439         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1440
1441         /*
1442          *      No usable cache entry
1443          */
1444         if (cache == NULL) {
1445                 int vif;
1446
1447                 if (local) {
1448                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1449                         ip_local_deliver(skb);
1450                         if (skb2 == NULL) {
1451                                 read_unlock(&mrt_lock);
1452                                 return -ENOBUFS;
1453                         }
1454                         skb = skb2;
1455                 }
1456
1457                 vif = ipmr_find_vif(skb->dev);
1458                 if (vif >= 0) {
1459                         int err = ipmr_cache_unresolved(vif, skb);
1460                         read_unlock(&mrt_lock);
1461
1462                         return err;
1463                 }
1464                 read_unlock(&mrt_lock);
1465                 kfree_skb(skb);
1466                 return -ENODEV;
1467         }
1468
1469         ip_mr_forward(skb, cache, local);
1470
1471         read_unlock(&mrt_lock);
1472
1473         if (local)
1474                 return ip_local_deliver(skb);
1475
1476         return 0;
1477
1478 dont_forward:
1479         if (local)
1480                 return ip_local_deliver(skb);
1481         kfree_skb(skb);
1482         return 0;
1483 }
1484
1485 #ifdef CONFIG_IP_PIMSM
1486 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1487 {
1488         struct net_device *reg_dev = NULL;
1489         struct iphdr *encap;
1490
1491         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1492         /*
1493            Check that:
1494            a. packet is really destinted to a multicast group
1495            b. packet is not a NULL-REGISTER
1496            c. packet is not truncated
1497          */
1498         if (!ipv4_is_multicast(encap->daddr) ||
1499             encap->tot_len == 0 ||
1500             ntohs(encap->tot_len) + pimlen > skb->len)
1501                 return 1;
1502
1503         read_lock(&mrt_lock);
1504         if (reg_vif_num >= 0)
1505                 reg_dev = vif_table[reg_vif_num].dev;
1506         if (reg_dev)
1507                 dev_hold(reg_dev);
1508         read_unlock(&mrt_lock);
1509
1510         if (reg_dev == NULL)
1511                 return 1;
1512
1513         skb->mac_header = skb->network_header;
1514         skb_pull(skb, (u8*)encap - skb->data);
1515         skb_reset_network_header(skb);
1516         skb->dev = reg_dev;
1517         skb->protocol = htons(ETH_P_IP);
1518         skb->ip_summed = 0;
1519         skb->pkt_type = PACKET_HOST;
1520         dst_release(skb->dst);
1521         skb->dst = NULL;
1522         reg_dev->stats.rx_bytes += skb->len;
1523         reg_dev->stats.rx_packets++;
1524         nf_reset(skb);
1525         netif_rx(skb);
1526         dev_put(reg_dev);
1527
1528         return 0;
1529 }
1530 #endif
1531
1532 #ifdef CONFIG_IP_PIMSM_V1
1533 /*
1534  * Handle IGMP messages of PIMv1
1535  */
1536
1537 int pim_rcv_v1(struct sk_buff * skb)
1538 {
1539         struct igmphdr *pim;
1540
1541         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1542                 goto drop;
1543
1544         pim = igmp_hdr(skb);
1545
1546         if (!mroute_do_pim ||
1547             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1548                 goto drop;
1549
1550         if (__pim_rcv(skb, sizeof(*pim))) {
1551 drop:
1552                 kfree_skb(skb);
1553         }
1554         return 0;
1555 }
1556 #endif
1557
1558 #ifdef CONFIG_IP_PIMSM_V2
1559 static int pim_rcv(struct sk_buff * skb)
1560 {
1561         struct pimreghdr *pim;
1562
1563         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1564                 goto drop;
1565
1566         pim = (struct pimreghdr *)skb_transport_header(skb);
1567         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1568             (pim->flags&PIM_NULL_REGISTER) ||
1569             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1570              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1571                 goto drop;
1572
1573         if (__pim_rcv(skb, sizeof(*pim))) {
1574 drop:
1575                 kfree_skb(skb);
1576         }
1577         return 0;
1578 }
1579 #endif
1580
1581 static int
1582 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1583 {
1584         int ct;
1585         struct rtnexthop *nhp;
1586         struct net_device *dev = vif_table[c->mfc_parent].dev;
1587         u8 *b = skb_tail_pointer(skb);
1588         struct rtattr *mp_head;
1589
1590         if (dev)
1591                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1592
1593         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1594
1595         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1596                 if (c->mfc_un.res.ttls[ct] < 255) {
1597                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1598                                 goto rtattr_failure;
1599                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1600                         nhp->rtnh_flags = 0;
1601                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1602                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1603                         nhp->rtnh_len = sizeof(*nhp);
1604                 }
1605         }
1606         mp_head->rta_type = RTA_MULTIPATH;
1607         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1608         rtm->rtm_type = RTN_MULTICAST;
1609         return 1;
1610
1611 rtattr_failure:
1612         nlmsg_trim(skb, b);
1613         return -EMSGSIZE;
1614 }
1615
1616 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1617 {
1618         int err;
1619         struct mfc_cache *cache;
1620         struct rtable *rt = skb->rtable;
1621
1622         read_lock(&mrt_lock);
1623         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1624
1625         if (cache == NULL) {
1626                 struct sk_buff *skb2;
1627                 struct iphdr *iph;
1628                 struct net_device *dev;
1629                 int vif;
1630
1631                 if (nowait) {
1632                         read_unlock(&mrt_lock);
1633                         return -EAGAIN;
1634                 }
1635
1636                 dev = skb->dev;
1637                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1638                         read_unlock(&mrt_lock);
1639                         return -ENODEV;
1640                 }
1641                 skb2 = skb_clone(skb, GFP_ATOMIC);
1642                 if (!skb2) {
1643                         read_unlock(&mrt_lock);
1644                         return -ENOMEM;
1645                 }
1646
1647                 skb_push(skb2, sizeof(struct iphdr));
1648                 skb_reset_network_header(skb2);
1649                 iph = ip_hdr(skb2);
1650                 iph->ihl = sizeof(struct iphdr) >> 2;
1651                 iph->saddr = rt->rt_src;
1652                 iph->daddr = rt->rt_dst;
1653                 iph->version = 0;
1654                 err = ipmr_cache_unresolved(vif, skb2);
1655                 read_unlock(&mrt_lock);
1656                 return err;
1657         }
1658
1659         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1660                 cache->mfc_flags |= MFC_NOTIFY;
1661         err = ipmr_fill_mroute(skb, cache, rtm);
1662         read_unlock(&mrt_lock);
1663         return err;
1664 }
1665
1666 #ifdef CONFIG_PROC_FS
1667 /*
1668  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1669  */
/* Private seq_file state for the vif listing. */
struct ipmr_vif_iter {
	int ct;		/* index into vif_table of the current entry */
};
1673
1674 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1675                                            loff_t pos)
1676 {
1677         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1678                 if (!VIF_EXISTS(iter->ct))
1679                         continue;
1680                 if (pos-- == 0)
1681                         return &vif_table[iter->ct];
1682         }
1683         return NULL;
1684 }
1685
1686 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1687         __acquires(mrt_lock)
1688 {
1689         read_lock(&mrt_lock);
1690         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1691                 : SEQ_START_TOKEN;
1692 }
1693
1694 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1695 {
1696         struct ipmr_vif_iter *iter = seq->private;
1697
1698         ++*pos;
1699         if (v == SEQ_START_TOKEN)
1700                 return ipmr_vif_seq_idx(iter, 0);
1701
1702         while (++iter->ct < maxvif) {
1703                 if (!VIF_EXISTS(iter->ct))
1704                         continue;
1705                 return &vif_table[iter->ct];
1706         }
1707         return NULL;
1708 }
1709
1710 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1711         __releases(mrt_lock)
1712 {
1713         read_unlock(&mrt_lock);
1714 }
1715
1716 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1717 {
1718         if (v == SEQ_START_TOKEN) {
1719                 seq_puts(seq,
1720                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1721         } else {
1722                 const struct vif_device *vif = v;
1723                 const char *name =  vif->dev ? vif->dev->name : "none";
1724
1725                 seq_printf(seq,
1726                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1727                            vif - vif_table,
1728                            name, vif->bytes_in, vif->pkt_in,
1729                            vif->bytes_out, vif->pkt_out,
1730                            vif->flags, vif->local, vif->remote);
1731         }
1732         return 0;
1733 }
1734
1735 static const struct seq_operations ipmr_vif_seq_ops = {
1736         .start = ipmr_vif_seq_start,
1737         .next  = ipmr_vif_seq_next,
1738         .stop  = ipmr_vif_seq_stop,
1739         .show  = ipmr_vif_seq_show,
1740 };
1741
1742 static int ipmr_vif_open(struct inode *inode, struct file *file)
1743 {
1744         return seq_open_private(file, &ipmr_vif_seq_ops,
1745                         sizeof(struct ipmr_vif_iter));
1746 }
1747
1748 static const struct file_operations ipmr_vif_fops = {
1749         .owner   = THIS_MODULE,
1750         .open    = ipmr_vif_open,
1751         .read    = seq_read,
1752         .llseek  = seq_lseek,
1753         .release = seq_release_private,
1754 };
1755
/*
 * Private seq_file state for the MFC listing. cache records which list is
 * being walked (mfc_cache_array, &mfc_unres_queue, or NULL for none) and
 * ct is the current line within mfc_cache_array.
 */
struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};
1760
1761
1762 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1763 {
1764         struct mfc_cache *mfc;
1765
1766         it->cache = mfc_cache_array;
1767         read_lock(&mrt_lock);
1768         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1769                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1770                         if (pos-- == 0)
1771                                 return mfc;
1772         read_unlock(&mrt_lock);
1773
1774         it->cache = &mfc_unres_queue;
1775         spin_lock_bh(&mfc_unres_lock);
1776         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1777                 if (pos-- == 0)
1778                         return mfc;
1779         spin_unlock_bh(&mfc_unres_lock);
1780
1781         it->cache = NULL;
1782         return NULL;
1783 }
1784
1785
1786 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1787 {
1788         struct ipmr_mfc_iter *it = seq->private;
1789         it->cache = NULL;
1790         it->ct = 0;
1791         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1792                 : SEQ_START_TOKEN;
1793 }
1794
1795 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1796 {
1797         struct mfc_cache *mfc = v;
1798         struct ipmr_mfc_iter *it = seq->private;
1799
1800         ++*pos;
1801
1802         if (v == SEQ_START_TOKEN)
1803                 return ipmr_mfc_seq_idx(seq->private, 0);
1804
1805         if (mfc->next)
1806                 return mfc->next;
1807
1808         if (it->cache == &mfc_unres_queue)
1809                 goto end_of_list;
1810
1811         BUG_ON(it->cache != mfc_cache_array);
1812
1813         while (++it->ct < MFC_LINES) {
1814                 mfc = mfc_cache_array[it->ct];
1815                 if (mfc)
1816                         return mfc;
1817         }
1818
1819         /* exhausted cache_array, show unresolved */
1820         read_unlock(&mrt_lock);
1821         it->cache = &mfc_unres_queue;
1822         it->ct = 0;
1823
1824         spin_lock_bh(&mfc_unres_lock);
1825         mfc = mfc_unres_queue;
1826         if (mfc)
1827                 return mfc;
1828
1829  end_of_list:
1830         spin_unlock_bh(&mfc_unres_lock);
1831         it->cache = NULL;
1832
1833         return NULL;
1834 }
1835
1836 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1837 {
1838         struct ipmr_mfc_iter *it = seq->private;
1839
1840         if (it->cache == &mfc_unres_queue)
1841                 spin_unlock_bh(&mfc_unres_lock);
1842         else if (it->cache == mfc_cache_array)
1843                 read_unlock(&mrt_lock);
1844 }
1845
1846 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1847 {
1848         int n;
1849
1850         if (v == SEQ_START_TOKEN) {
1851                 seq_puts(seq,
1852                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1853         } else {
1854                 const struct mfc_cache *mfc = v;
1855                 const struct ipmr_mfc_iter *it = seq->private;
1856
1857                 seq_printf(seq, "%08lX %08lX %-3hd",
1858                            (unsigned long) mfc->mfc_mcastgrp,
1859                            (unsigned long) mfc->mfc_origin,
1860                            mfc->mfc_parent);
1861
1862                 if (it->cache != &mfc_unres_queue) {
1863                         seq_printf(seq, " %8lu %8lu %8lu",
1864                                    mfc->mfc_un.res.pkt,
1865                                    mfc->mfc_un.res.bytes,
1866                                    mfc->mfc_un.res.wrong_if);
1867                         for (n = mfc->mfc_un.res.minvif;
1868                              n < mfc->mfc_un.res.maxvif; n++ ) {
1869                                 if (VIF_EXISTS(n)
1870                                    && mfc->mfc_un.res.ttls[n] < 255)
1871                                 seq_printf(seq,
1872                                            " %2d:%-3d",
1873                                            n, mfc->mfc_un.res.ttls[n]);
1874                         }
1875                 } else {
1876                         /* unresolved mfc_caches don't contain
1877                          * pkt, bytes and wrong_if values
1878                          */
1879                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1880                 }
1881                 seq_putc(seq, '\n');
1882         }
1883         return 0;
1884 }
1885
1886 static const struct seq_operations ipmr_mfc_seq_ops = {
1887         .start = ipmr_mfc_seq_start,
1888         .next  = ipmr_mfc_seq_next,
1889         .stop  = ipmr_mfc_seq_stop,
1890         .show  = ipmr_mfc_seq_show,
1891 };
1892
1893 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1894 {
1895         return seq_open_private(file, &ipmr_mfc_seq_ops,
1896                         sizeof(struct ipmr_mfc_iter));
1897 }
1898
1899 static const struct file_operations ipmr_mfc_fops = {
1900         .owner   = THIS_MODULE,
1901         .open    = ipmr_mfc_open,
1902         .read    = seq_read,
1903         .llseek  = seq_lseek,
1904         .release = seq_release_private,
1905 };
1906 #endif
1907
1908 #ifdef CONFIG_IP_PIMSM_V2
1909 static struct net_protocol pim_protocol = {
1910         .handler        =       pim_rcv,
1911 };
1912 #endif
1913
1914
1915 /*
1916  *      Setup for IP multicast routing
1917  */
1918
1919 int __init ip_mr_init(void)
1920 {
1921         int err;
1922
1923         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1924                                        sizeof(struct mfc_cache),
1925                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1926                                        NULL);
1927         if (!mrt_cachep)
1928                 return -ENOMEM;
1929
1930         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1931         err = register_netdevice_notifier(&ip_mr_notifier);
1932         if (err)
1933                 goto reg_notif_fail;
1934 #ifdef CONFIG_PROC_FS
1935         err = -ENOMEM;
1936         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1937                 goto proc_vif_fail;
1938         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1939                 goto proc_cache_fail;
1940 #endif
1941         return 0;
1942 #ifdef CONFIG_PROC_FS
1943 proc_cache_fail:
1944         proc_net_remove(&init_net, "ip_mr_vif");
1945 proc_vif_fail:
1946         unregister_netdevice_notifier(&ip_mr_notifier);
1947 #endif
1948 reg_notif_fail:
1949         del_timer(&ipmr_expire_timer);
1950         kmem_cache_destroy(mrt_cachep);
1951         return err;
1952 }