Merge git://git.infradead.org/~dedekind/ubi-2.6
[linux-2.6] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/net_namespace.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66 #include <net/netlink.h>
67
/* CONFIG_IP_PIMSM is the common switch: set when either PIM-SM version
 * (v1 or v2) is configured.
 */
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

/* Socket that successfully issued MRT_INIT; NULL while no multicast
 * routing socket is registered.
 */
static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;                                      /* Highest vif index in use, plus one */

/* True when slot idx of vif_table currently has a device attached. */
#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;                               /* Set by the MRT_PIM socket option */

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected
   by the weak lock mrt_lock. The queue of unresolved entries is
   protected by the strong spinlock mfc_unres_lock.

   In this case the data path is entirely free of exclusive locks.
 */

/* Slab cache from which every struct mfc_cache in this file is allocated. */
static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

/* Single timer shared by all entries on mfc_unres_queue. */
static struct timer_list ipmr_expire_timer;
120
121 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
122
123 static
124 struct net_device *ipmr_new_tunnel(struct vifctl *v)
125 {
126         struct net_device  *dev;
127
128         dev = __dev_get_by_name(&init_net, "tunl0");
129
130         if (dev) {
131                 int err;
132                 struct ifreq ifr;
133                 mm_segment_t    oldfs;
134                 struct ip_tunnel_parm p;
135                 struct in_device  *in_dev;
136
137                 memset(&p, 0, sizeof(p));
138                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
139                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
140                 p.iph.version = 4;
141                 p.iph.ihl = 5;
142                 p.iph.protocol = IPPROTO_IPIP;
143                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
144                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
145
146                 oldfs = get_fs(); set_fs(KERNEL_DS);
147                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
148                 set_fs(oldfs);
149
150                 dev = NULL;
151
152                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
153                         dev->flags |= IFF_MULTICAST;
154
155                         in_dev = __in_dev_get_rtnl(dev);
156                         if (in_dev == NULL)
157                                 goto failure;
158
159                         ipv4_devconf_setall(in_dev);
160                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
161
162                         if (dev_open(dev))
163                                 goto failure;
164                 }
165         }
166         return dev;
167
168 failure:
169         /* allow the register to be completed before unregistering. */
170         rtnl_unlock();
171         rtnl_lock();
172
173         unregister_netdevice(dev);
174         return NULL;
175 }
176
177 #ifdef CONFIG_IP_PIMSM
178
/* Index of the PIM register vif in vif_table, or -1 while none exists. */
static int reg_vif_num = -1;
180
181 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
182 {
183         read_lock(&mrt_lock);
184         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
185         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
186         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
187         read_unlock(&mrt_lock);
188         kfree_skb(skb);
189         return 0;
190 }
191
/*
 *	Statistics hook of the PIM register device: the counters are stored
 *	directly in the netdev private area.
 */
static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = netdev_priv(dev);

	return stats;
}
196
197 static void reg_vif_setup(struct net_device *dev)
198 {
199         dev->type               = ARPHRD_PIMREG;
200         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
201         dev->flags              = IFF_NOARP;
202         dev->hard_start_xmit    = reg_vif_xmit;
203         dev->get_stats          = reg_vif_get_stats;
204         dev->destructor         = free_netdev;
205 }
206
207 static struct net_device *ipmr_reg_vif(void)
208 {
209         struct net_device *dev;
210         struct in_device *in_dev;
211
212         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
213                            reg_vif_setup);
214
215         if (dev == NULL)
216                 return NULL;
217
218         if (register_netdevice(dev)) {
219                 free_netdev(dev);
220                 return NULL;
221         }
222         dev->iflink = 0;
223
224         rcu_read_lock();
225         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
226                 rcu_read_unlock();
227                 goto failure;
228         }
229
230         ipv4_devconf_setall(in_dev);
231         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
232         rcu_read_unlock();
233
234         if (dev_open(dev))
235                 goto failure;
236
237         return dev;
238
239 failure:
240         /* allow the register to be completed before unregistering. */
241         rtnl_unlock();
242         rtnl_lock();
243
244         unregister_netdevice(dev);
245         return NULL;
246 }
247 #endif
248
249 /*
250  *      Delete a VIF entry
251  */
252
253 static int vif_delete(int vifi)
254 {
255         struct vif_device *v;
256         struct net_device *dev;
257         struct in_device *in_dev;
258
259         if (vifi < 0 || vifi >= maxvif)
260                 return -EADDRNOTAVAIL;
261
262         v = &vif_table[vifi];
263
264         write_lock_bh(&mrt_lock);
265         dev = v->dev;
266         v->dev = NULL;
267
268         if (!dev) {
269                 write_unlock_bh(&mrt_lock);
270                 return -EADDRNOTAVAIL;
271         }
272
273 #ifdef CONFIG_IP_PIMSM
274         if (vifi == reg_vif_num)
275                 reg_vif_num = -1;
276 #endif
277
278         if (vifi+1 == maxvif) {
279                 int tmp;
280                 for (tmp=vifi-1; tmp>=0; tmp--) {
281                         if (VIF_EXISTS(tmp))
282                                 break;
283                 }
284                 maxvif = tmp+1;
285         }
286
287         write_unlock_bh(&mrt_lock);
288
289         dev_set_allmulti(dev, -1);
290
291         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
292                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
293                 ip_rt_multicast_event(in_dev);
294         }
295
296         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
297                 unregister_netdevice(dev);
298
299         dev_put(dev);
300         return 0;
301 }
302
303 /* Destroy an unresolved cache entry, killing queued skbs
304    and reporting error to netlink readers.
305  */
306
307 static void ipmr_destroy_unres(struct mfc_cache *c)
308 {
309         struct sk_buff *skb;
310         struct nlmsgerr *e;
311
312         atomic_dec(&cache_resolve_queue_len);
313
314         while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
315                 if (ip_hdr(skb)->version == 0) {
316                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
317                         nlh->nlmsg_type = NLMSG_ERROR;
318                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
319                         skb_trim(skb, nlh->nlmsg_len);
320                         e = NLMSG_DATA(nlh);
321                         e->error = -ETIMEDOUT;
322                         memset(&e->msg, 0, sizeof(e->msg));
323
324                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
325                 } else
326                         kfree_skb(skb);
327         }
328
329         kmem_cache_free(mrt_cachep, c);
330 }
331
332
333 /* Single timer process for all the unresolved queue. */
334
335 static void ipmr_expire_process(unsigned long dummy)
336 {
337         unsigned long now;
338         unsigned long expires;
339         struct mfc_cache *c, **cp;
340
341         if (!spin_trylock(&mfc_unres_lock)) {
342                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
343                 return;
344         }
345
346         if (atomic_read(&cache_resolve_queue_len) == 0)
347                 goto out;
348
349         now = jiffies;
350         expires = 10*HZ;
351         cp = &mfc_unres_queue;
352
353         while ((c=*cp) != NULL) {
354                 if (time_after(c->mfc_un.unres.expires, now)) {
355                         unsigned long interval = c->mfc_un.unres.expires - now;
356                         if (interval < expires)
357                                 expires = interval;
358                         cp = &c->next;
359                         continue;
360                 }
361
362                 *cp = c->next;
363
364                 ipmr_destroy_unres(c);
365         }
366
367         if (atomic_read(&cache_resolve_queue_len))
368                 mod_timer(&ipmr_expire_timer, jiffies + expires);
369
370 out:
371         spin_unlock(&mfc_unres_lock);
372 }
373
374 /* Fill oifs list. It is called under write locked mrt_lock. */
375
376 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
377 {
378         int vifi;
379
380         cache->mfc_un.res.minvif = MAXVIFS;
381         cache->mfc_un.res.maxvif = 0;
382         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
383
384         for (vifi=0; vifi<maxvif; vifi++) {
385                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
386                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
387                         if (cache->mfc_un.res.minvif > vifi)
388                                 cache->mfc_un.res.minvif = vifi;
389                         if (cache->mfc_un.res.maxvif <= vifi)
390                                 cache->mfc_un.res.maxvif = vifi + 1;
391                 }
392         }
393 }
394
395 static int vif_add(struct vifctl *vifc, int mrtsock)
396 {
397         int vifi = vifc->vifc_vifi;
398         struct vif_device *v = &vif_table[vifi];
399         struct net_device *dev;
400         struct in_device *in_dev;
401
402         /* Is vif busy ? */
403         if (VIF_EXISTS(vifi))
404                 return -EADDRINUSE;
405
406         switch (vifc->vifc_flags) {
407 #ifdef CONFIG_IP_PIMSM
408         case VIFF_REGISTER:
409                 /*
410                  * Special Purpose VIF in PIM
411                  * All the packets will be sent to the daemon
412                  */
413                 if (reg_vif_num >= 0)
414                         return -EADDRINUSE;
415                 dev = ipmr_reg_vif();
416                 if (!dev)
417                         return -ENOBUFS;
418                 break;
419 #endif
420         case VIFF_TUNNEL:
421                 dev = ipmr_new_tunnel(vifc);
422                 if (!dev)
423                         return -ENOBUFS;
424                 break;
425         case 0:
426                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
427                 if (!dev)
428                         return -EADDRNOTAVAIL;
429                 dev_put(dev);
430                 break;
431         default:
432                 return -EINVAL;
433         }
434
435         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
436                 return -EADDRNOTAVAIL;
437         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
438         dev_set_allmulti(dev, +1);
439         ip_rt_multicast_event(in_dev);
440
441         /*
442          *      Fill in the VIF structures
443          */
444         v->rate_limit=vifc->vifc_rate_limit;
445         v->local=vifc->vifc_lcl_addr.s_addr;
446         v->remote=vifc->vifc_rmt_addr.s_addr;
447         v->flags=vifc->vifc_flags;
448         if (!mrtsock)
449                 v->flags |= VIFF_STATIC;
450         v->threshold=vifc->vifc_threshold;
451         v->bytes_in = 0;
452         v->bytes_out = 0;
453         v->pkt_in = 0;
454         v->pkt_out = 0;
455         v->link = dev->ifindex;
456         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
457                 v->link = dev->iflink;
458
459         /* And finish update writing critical data */
460         write_lock_bh(&mrt_lock);
461         dev_hold(dev);
462         v->dev=dev;
463 #ifdef CONFIG_IP_PIMSM
464         if (v->flags&VIFF_REGISTER)
465                 reg_vif_num = vifi;
466 #endif
467         if (vifi+1 > maxvif)
468                 maxvif = vifi+1;
469         write_unlock_bh(&mrt_lock);
470         return 0;
471 }
472
473 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
474 {
475         int line=MFC_HASH(mcastgrp,origin);
476         struct mfc_cache *c;
477
478         for (c=mfc_cache_array[line]; c; c = c->next) {
479                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
480                         break;
481         }
482         return c;
483 }
484
485 /*
486  *      Allocate a multicast cache entry
487  */
488 static struct mfc_cache *ipmr_cache_alloc(void)
489 {
490         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
491         if (c==NULL)
492                 return NULL;
493         c->mfc_un.res.minvif = MAXVIFS;
494         return c;
495 }
496
497 static struct mfc_cache *ipmr_cache_alloc_unres(void)
498 {
499         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
500         if (c==NULL)
501                 return NULL;
502         skb_queue_head_init(&c->mfc_un.unres.unresolved);
503         c->mfc_un.unres.expires = jiffies + 10*HZ;
504         return c;
505 }
506
507 /*
508  *      A cache entry has gone into a resolved state from queued
509  */
510
511 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
512 {
513         struct sk_buff *skb;
514         struct nlmsgerr *e;
515
516         /*
517          *      Play the pending entries through our router
518          */
519
520         while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
521                 if (ip_hdr(skb)->version == 0) {
522                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
523
524                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
525                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
526                                                   (u8 *)nlh);
527                         } else {
528                                 nlh->nlmsg_type = NLMSG_ERROR;
529                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
530                                 skb_trim(skb, nlh->nlmsg_len);
531                                 e = NLMSG_DATA(nlh);
532                                 e->error = -EMSGSIZE;
533                                 memset(&e->msg, 0, sizeof(e->msg));
534                         }
535
536                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
537                 } else
538                         ip_mr_forward(skb, c, 0);
539         }
540 }
541
/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 *
 *      @pkt is the packet that triggered the report and is left untouched;
 *      the message queued to the daemon is a separate skb.  @vifi is stored
 *      in the message for ordinary reports, and @assert is the IGMPMSG_*
 *      message type.
 *
 *      Returns -ENOBUFS if no skb could be allocated, -EINVAL if there is
 *      no mroute socket, otherwise the result of sock_queue_rcv_skb().
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

        /* WHOLEPKT reports carry the packet itself, with room for one extra
           IP header in front; every other report is a small fresh skb. */
#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                /* The register vif is reported here, not the vifi argument. */
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        /* Chained assignment: both type fields receive the message type. */
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        /* Nobody is listening: drop the message we just built. */
        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}
629
630 /*
631  *      Queue a packet for resolution. It gets locked cache entry!
632  */
633
634 static int
635 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
636 {
637         int err;
638         struct mfc_cache *c;
639         const struct iphdr *iph = ip_hdr(skb);
640
641         spin_lock_bh(&mfc_unres_lock);
642         for (c=mfc_unres_queue; c; c=c->next) {
643                 if (c->mfc_mcastgrp == iph->daddr &&
644                     c->mfc_origin == iph->saddr)
645                         break;
646         }
647
648         if (c == NULL) {
649                 /*
650                  *      Create a new entry if allowable
651                  */
652
653                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
654                     (c=ipmr_cache_alloc_unres())==NULL) {
655                         spin_unlock_bh(&mfc_unres_lock);
656
657                         kfree_skb(skb);
658                         return -ENOBUFS;
659                 }
660
661                 /*
662                  *      Fill in the new cache entry
663                  */
664                 c->mfc_parent   = -1;
665                 c->mfc_origin   = iph->saddr;
666                 c->mfc_mcastgrp = iph->daddr;
667
668                 /*
669                  *      Reflect first query at mrouted.
670                  */
671                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
672                         /* If the report failed throw the cache entry
673                            out - Brad Parker
674                          */
675                         spin_unlock_bh(&mfc_unres_lock);
676
677                         kmem_cache_free(mrt_cachep, c);
678                         kfree_skb(skb);
679                         return err;
680                 }
681
682                 atomic_inc(&cache_resolve_queue_len);
683                 c->next = mfc_unres_queue;
684                 mfc_unres_queue = c;
685
686                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
687         }
688
689         /*
690          *      See if we can append the packet
691          */
692         if (c->mfc_un.unres.unresolved.qlen>3) {
693                 kfree_skb(skb);
694                 err = -ENOBUFS;
695         } else {
696                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
697                 err = 0;
698         }
699
700         spin_unlock_bh(&mfc_unres_lock);
701         return err;
702 }
703
704 /*
705  *      MFC cache manipulation by user space mroute daemon
706  */
707
708 static int ipmr_mfc_delete(struct mfcctl *mfc)
709 {
710         int line;
711         struct mfc_cache *c, **cp;
712
713         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714
715         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
718                         write_lock_bh(&mrt_lock);
719                         *cp = c->next;
720                         write_unlock_bh(&mrt_lock);
721
722                         kmem_cache_free(mrt_cachep, c);
723                         return 0;
724                 }
725         }
726         return -ENOENT;
727 }
728
729 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
730 {
731         int line;
732         struct mfc_cache *uc, *c, **cp;
733
734         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
735
736         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
737                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
738                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
739                         break;
740         }
741
742         if (c != NULL) {
743                 write_lock_bh(&mrt_lock);
744                 c->mfc_parent = mfc->mfcc_parent;
745                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
746                 if (!mrtsock)
747                         c->mfc_flags |= MFC_STATIC;
748                 write_unlock_bh(&mrt_lock);
749                 return 0;
750         }
751
752         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
753                 return -EINVAL;
754
755         c=ipmr_cache_alloc();
756         if (c==NULL)
757                 return -ENOMEM;
758
759         c->mfc_origin=mfc->mfcc_origin.s_addr;
760         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
761         c->mfc_parent=mfc->mfcc_parent;
762         ipmr_update_thresholds(c, mfc->mfcc_ttls);
763         if (!mrtsock)
764                 c->mfc_flags |= MFC_STATIC;
765
766         write_lock_bh(&mrt_lock);
767         c->next = mfc_cache_array[line];
768         mfc_cache_array[line] = c;
769         write_unlock_bh(&mrt_lock);
770
771         /*
772          *      Check to see if we resolved a queued list. If so we
773          *      need to send on the frames and tidy up.
774          */
775         spin_lock_bh(&mfc_unres_lock);
776         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
777              cp = &uc->next) {
778                 if (uc->mfc_origin == c->mfc_origin &&
779                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
780                         *cp = uc->next;
781                         if (atomic_dec_and_test(&cache_resolve_queue_len))
782                                 del_timer(&ipmr_expire_timer);
783                         break;
784                 }
785         }
786         spin_unlock_bh(&mfc_unres_lock);
787
788         if (uc) {
789                 ipmr_cache_resolve(uc, c);
790                 kmem_cache_free(mrt_cachep, uc);
791         }
792         return 0;
793 }
794
795 /*
796  *      Close the multicast socket, and clear the vif tables etc
797  */
798
799 static void mroute_clean_tables(struct sock *sk)
800 {
801         int i;
802
803         /*
804          *      Shut down all active vif entries
805          */
806         for (i=0; i<maxvif; i++) {
807                 if (!(vif_table[i].flags&VIFF_STATIC))
808                         vif_delete(i);
809         }
810
811         /*
812          *      Wipe the cache
813          */
814         for (i=0;i<MFC_LINES;i++) {
815                 struct mfc_cache *c, **cp;
816
817                 cp = &mfc_cache_array[i];
818                 while ((c = *cp) != NULL) {
819                         if (c->mfc_flags&MFC_STATIC) {
820                                 cp = &c->next;
821                                 continue;
822                         }
823                         write_lock_bh(&mrt_lock);
824                         *cp = c->next;
825                         write_unlock_bh(&mrt_lock);
826
827                         kmem_cache_free(mrt_cachep, c);
828                 }
829         }
830
831         if (atomic_read(&cache_resolve_queue_len) != 0) {
832                 struct mfc_cache *c;
833
834                 spin_lock_bh(&mfc_unres_lock);
835                 while (mfc_unres_queue != NULL) {
836                         c = mfc_unres_queue;
837                         mfc_unres_queue = c->next;
838                         spin_unlock_bh(&mfc_unres_lock);
839
840                         ipmr_destroy_unres(c);
841
842                         spin_lock_bh(&mfc_unres_lock);
843                 }
844                 spin_unlock_bh(&mfc_unres_lock);
845         }
846 }
847
848 static void mrtsock_destruct(struct sock *sk)
849 {
850         rtnl_lock();
851         if (sk == mroute_socket) {
852                 IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--;
853
854                 write_lock_bh(&mrt_lock);
855                 mroute_socket=NULL;
856                 write_unlock_bh(&mrt_lock);
857
858                 mroute_clean_tables(sk);
859         }
860         rtnl_unlock();
861 }
862
863 /*
864  *      Socket options and virtual interface manipulation. The whole
865  *      virtual interface system is a complete heap, but unfortunately
866  *      that's how BSD mrouted happens to think. Maybe one day with a proper
867  *      MOSPF/PIM router set up we can clean this up.
868  */
869
870 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
871 {
872         int ret;
873         struct vifctl vif;
874         struct mfcctl mfc;
875
876         if (optname != MRT_INIT) {
877                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
878                         return -EACCES;
879         }
880
881         switch (optname) {
882         case MRT_INIT:
883                 if (sk->sk_type != SOCK_RAW ||
884                     inet_sk(sk)->num != IPPROTO_IGMP)
885                         return -EOPNOTSUPP;
886                 if (optlen!=sizeof(int))
887                         return -ENOPROTOOPT;
888
889                 rtnl_lock();
890                 if (mroute_socket) {
891                         rtnl_unlock();
892                         return -EADDRINUSE;
893                 }
894
895                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
896                 if (ret == 0) {
897                         write_lock_bh(&mrt_lock);
898                         mroute_socket=sk;
899                         write_unlock_bh(&mrt_lock);
900
901                         IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++;
902                 }
903                 rtnl_unlock();
904                 return ret;
905         case MRT_DONE:
906                 if (sk!=mroute_socket)
907                         return -EACCES;
908                 return ip_ra_control(sk, 0, NULL);
909         case MRT_ADD_VIF:
910         case MRT_DEL_VIF:
911                 if (optlen!=sizeof(vif))
912                         return -EINVAL;
913                 if (copy_from_user(&vif,optval,sizeof(vif)))
914                         return -EFAULT;
915                 if (vif.vifc_vifi >= MAXVIFS)
916                         return -ENFILE;
917                 rtnl_lock();
918                 if (optname==MRT_ADD_VIF) {
919                         ret = vif_add(&vif, sk==mroute_socket);
920                 } else {
921                         ret = vif_delete(vif.vifc_vifi);
922                 }
923                 rtnl_unlock();
924                 return ret;
925
926                 /*
927                  *      Manipulate the forwarding caches. These live
928                  *      in a sort of kernel/user symbiosis.
929                  */
930         case MRT_ADD_MFC:
931         case MRT_DEL_MFC:
932                 if (optlen!=sizeof(mfc))
933                         return -EINVAL;
934                 if (copy_from_user(&mfc,optval, sizeof(mfc)))
935                         return -EFAULT;
936                 rtnl_lock();
937                 if (optname==MRT_DEL_MFC)
938                         ret = ipmr_mfc_delete(&mfc);
939                 else
940                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
941                 rtnl_unlock();
942                 return ret;
943                 /*
944                  *      Control PIM assert.
945                  */
946         case MRT_ASSERT:
947         {
948                 int v;
949                 if (get_user(v,(int __user *)optval))
950                         return -EFAULT;
951                 mroute_do_assert=(v)?1:0;
952                 return 0;
953         }
954 #ifdef CONFIG_IP_PIMSM
955         case MRT_PIM:
956         {
957                 int v;
958
959                 if (get_user(v,(int __user *)optval))
960                         return -EFAULT;
961                 v = (v) ? 1 : 0;
962
963                 rtnl_lock();
964                 ret = 0;
965                 if (v != mroute_do_pim) {
966                         mroute_do_pim = v;
967                         mroute_do_assert = v;
968 #ifdef CONFIG_IP_PIMSM_V2
969                         if (mroute_do_pim)
970                                 ret = inet_add_protocol(&pim_protocol,
971                                                         IPPROTO_PIM);
972                         else
973                                 ret = inet_del_protocol(&pim_protocol,
974                                                         IPPROTO_PIM);
975                         if (ret < 0)
976                                 ret = -EAGAIN;
977 #endif
978                 }
979                 rtnl_unlock();
980                 return ret;
981         }
982 #endif
983         /*
984          *      Spurious command, or MRT_VERSION which you cannot
985          *      set.
986          */
987         default:
988                 return -ENOPROTOOPT;
989         }
990 }
991
992 /*
993  *      Getsock opt support for the multicast routing system.
994  */
995
996 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
997 {
998         int olr;
999         int val;
1000
1001         if (optname!=MRT_VERSION &&
1002 #ifdef CONFIG_IP_PIMSM
1003            optname!=MRT_PIM &&
1004 #endif
1005            optname!=MRT_ASSERT)
1006                 return -ENOPROTOOPT;
1007
1008         if (get_user(olr, optlen))
1009                 return -EFAULT;
1010
1011         olr = min_t(unsigned int, olr, sizeof(int));
1012         if (olr < 0)
1013                 return -EINVAL;
1014
1015         if (put_user(olr,optlen))
1016                 return -EFAULT;
1017         if (optname==MRT_VERSION)
1018                 val=0x0305;
1019 #ifdef CONFIG_IP_PIMSM
1020         else if (optname==MRT_PIM)
1021                 val=mroute_do_pim;
1022 #endif
1023         else
1024                 val=mroute_do_assert;
1025         if (copy_to_user(optval,&val,olr))
1026                 return -EFAULT;
1027         return 0;
1028 }
1029
1030 /*
1031  *      The IP multicast ioctl support routines.
1032  */
1033
1034 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1035 {
1036         struct sioc_sg_req sr;
1037         struct sioc_vif_req vr;
1038         struct vif_device *vif;
1039         struct mfc_cache *c;
1040
1041         switch (cmd) {
1042         case SIOCGETVIFCNT:
1043                 if (copy_from_user(&vr,arg,sizeof(vr)))
1044                         return -EFAULT;
1045                 if (vr.vifi>=maxvif)
1046                         return -EINVAL;
1047                 read_lock(&mrt_lock);
1048                 vif=&vif_table[vr.vifi];
1049                 if (VIF_EXISTS(vr.vifi))        {
1050                         vr.icount=vif->pkt_in;
1051                         vr.ocount=vif->pkt_out;
1052                         vr.ibytes=vif->bytes_in;
1053                         vr.obytes=vif->bytes_out;
1054                         read_unlock(&mrt_lock);
1055
1056                         if (copy_to_user(arg,&vr,sizeof(vr)))
1057                                 return -EFAULT;
1058                         return 0;
1059                 }
1060                 read_unlock(&mrt_lock);
1061                 return -EADDRNOTAVAIL;
1062         case SIOCGETSGCNT:
1063                 if (copy_from_user(&sr,arg,sizeof(sr)))
1064                         return -EFAULT;
1065
1066                 read_lock(&mrt_lock);
1067                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1068                 if (c) {
1069                         sr.pktcnt = c->mfc_un.res.pkt;
1070                         sr.bytecnt = c->mfc_un.res.bytes;
1071                         sr.wrong_if = c->mfc_un.res.wrong_if;
1072                         read_unlock(&mrt_lock);
1073
1074                         if (copy_to_user(arg,&sr,sizeof(sr)))
1075                                 return -EFAULT;
1076                         return 0;
1077                 }
1078                 read_unlock(&mrt_lock);
1079                 return -EADDRNOTAVAIL;
1080         default:
1081                 return -ENOIOCTLCMD;
1082         }
1083 }
1084
1085
1086 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1087 {
1088         struct net_device *dev = ptr;
1089         struct vif_device *v;
1090         int ct;
1091
1092         if (dev->nd_net != &init_net)
1093                 return NOTIFY_DONE;
1094
1095         if (event != NETDEV_UNREGISTER)
1096                 return NOTIFY_DONE;
1097         v=&vif_table[0];
1098         for (ct=0;ct<maxvif;ct++,v++) {
1099                 if (v->dev==dev)
1100                         vif_delete(ct);
1101         }
1102         return NOTIFY_DONE;
1103 }
1104
1105
1106 static struct notifier_block ip_mr_notifier={
1107         .notifier_call = ipmr_device_event,
1108 };
1109
1110 /*
1111  *      Encapsulate a packet by attaching a valid IPIP header to it.
1112  *      This avoids tunnel drivers and other mess and gives us the speed so
1113  *      important for multicast video.
1114  */
1115
1116 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1117 {
1118         struct iphdr *iph;
1119         struct iphdr *old_iph = ip_hdr(skb);
1120
1121         skb_push(skb, sizeof(struct iphdr));
1122         skb->transport_header = skb->network_header;
1123         skb_reset_network_header(skb);
1124         iph = ip_hdr(skb);
1125
1126         iph->version    =       4;
1127         iph->tos        =       old_iph->tos;
1128         iph->ttl        =       old_iph->ttl;
1129         iph->frag_off   =       0;
1130         iph->daddr      =       daddr;
1131         iph->saddr      =       saddr;
1132         iph->protocol   =       IPPROTO_IPIP;
1133         iph->ihl        =       5;
1134         iph->tot_len    =       htons(skb->len);
1135         ip_select_ident(iph, skb->dst, NULL);
1136         ip_send_check(iph);
1137
1138         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1139         nf_reset(skb);
1140 }
1141
1142 static inline int ipmr_forward_finish(struct sk_buff *skb)
1143 {
1144         struct ip_options * opt = &(IPCB(skb)->opt);
1145
1146         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1147
1148         if (unlikely(opt->optlen))
1149                 ip_forward_options(skb);
1150
1151         return dst_output(skb);
1152 }
1153
1154 /*
1155  *      Processing handlers for ipmr_forward
1156  */
1157
1158 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1159 {
1160         const struct iphdr *iph = ip_hdr(skb);
1161         struct vif_device *vif = &vif_table[vifi];
1162         struct net_device *dev;
1163         struct rtable *rt;
1164         int    encap = 0;
1165
1166         if (vif->dev == NULL)
1167                 goto out_free;
1168
1169 #ifdef CONFIG_IP_PIMSM
1170         if (vif->flags & VIFF_REGISTER) {
1171                 vif->pkt_out++;
1172                 vif->bytes_out+=skb->len;
1173                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1174                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1175                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1176                 kfree_skb(skb);
1177                 return;
1178         }
1179 #endif
1180
1181         if (vif->flags&VIFF_TUNNEL) {
1182                 struct flowi fl = { .oif = vif->link,
1183                                     .nl_u = { .ip4_u =
1184                                               { .daddr = vif->remote,
1185                                                 .saddr = vif->local,
1186                                                 .tos = RT_TOS(iph->tos) } },
1187                                     .proto = IPPROTO_IPIP };
1188                 if (ip_route_output_key(&init_net, &rt, &fl))
1189                         goto out_free;
1190                 encap = sizeof(struct iphdr);
1191         } else {
1192                 struct flowi fl = { .oif = vif->link,
1193                                     .nl_u = { .ip4_u =
1194                                               { .daddr = iph->daddr,
1195                                                 .tos = RT_TOS(iph->tos) } },
1196                                     .proto = IPPROTO_IPIP };
1197                 if (ip_route_output_key(&init_net, &rt, &fl))
1198                         goto out_free;
1199         }
1200
1201         dev = rt->u.dst.dev;
1202
1203         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1204                 /* Do not fragment multicasts. Alas, IPv4 does not
1205                    allow to send ICMP, so that packets will disappear
1206                    to blackhole.
1207                  */
1208
1209                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1210                 ip_rt_put(rt);
1211                 goto out_free;
1212         }
1213
1214         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1215
1216         if (skb_cow(skb, encap)) {
1217                 ip_rt_put(rt);
1218                 goto out_free;
1219         }
1220
1221         vif->pkt_out++;
1222         vif->bytes_out+=skb->len;
1223
1224         dst_release(skb->dst);
1225         skb->dst = &rt->u.dst;
1226         ip_decrease_ttl(ip_hdr(skb));
1227
1228         /* FIXME: forward and output firewalls used to be called here.
1229          * What do we do with netfilter? -- RR */
1230         if (vif->flags & VIFF_TUNNEL) {
1231                 ip_encap(skb, vif->local, vif->remote);
1232                 /* FIXME: extra output firewall step used to be here. --RR */
1233                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1234                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1235         }
1236
1237         IPCB(skb)->flags |= IPSKB_FORWARDED;
1238
1239         /*
1240          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1241          * not only before forwarding, but after forwarding on all output
1242          * interfaces. It is clear, if mrouter runs a multicasting
1243          * program, it should receive packets not depending to what interface
1244          * program is joined.
1245          * If we will not make it, the program will have to join on all
1246          * interfaces. On the other hand, multihoming host (or router, but
1247          * not mrouter) cannot join to more than one interface - it will
1248          * result in receiving multiple packets.
1249          */
1250         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1251                 ipmr_forward_finish);
1252         return;
1253
1254 out_free:
1255         kfree_skb(skb);
1256         return;
1257 }
1258
1259 static int ipmr_find_vif(struct net_device *dev)
1260 {
1261         int ct;
1262         for (ct=maxvif-1; ct>=0; ct--) {
1263                 if (vif_table[ct].dev == dev)
1264                         break;
1265         }
1266         return ct;
1267 }
1268
1269 /* "local" means that we should preserve one skb (for local delivery) */
1270
1271 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1272 {
1273         int psend = -1;
1274         int vif, ct;
1275
1276         vif = cache->mfc_parent;
1277         cache->mfc_un.res.pkt++;
1278         cache->mfc_un.res.bytes += skb->len;
1279
1280         /*
1281          * Wrong interface: drop packet and (maybe) send PIM assert.
1282          */
1283         if (vif_table[vif].dev != skb->dev) {
1284                 int true_vifi;
1285
1286                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1287                         /* It is our own packet, looped back.
1288                            Very complicated situation...
1289
1290                            The best workaround until routing daemons will be
1291                            fixed is not to redistribute packet, if it was
1292                            send through wrong interface. It means, that
1293                            multicast applications WILL NOT work for
1294                            (S,G), which have default multicast route pointing
1295                            to wrong oif. In any case, it is not a good
1296                            idea to use multicasting applications on router.
1297                          */
1298                         goto dont_forward;
1299                 }
1300
1301                 cache->mfc_un.res.wrong_if++;
1302                 true_vifi = ipmr_find_vif(skb->dev);
1303
1304                 if (true_vifi >= 0 && mroute_do_assert &&
1305                     /* pimsm uses asserts, when switching from RPT to SPT,
1306                        so that we cannot check that packet arrived on an oif.
1307                        It is bad, but otherwise we would need to move pretty
1308                        large chunk of pimd to kernel. Ough... --ANK
1309                      */
1310                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1311                     time_after(jiffies,
1312                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1313                         cache->mfc_un.res.last_assert = jiffies;
1314                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1315                 }
1316                 goto dont_forward;
1317         }
1318
1319         vif_table[vif].pkt_in++;
1320         vif_table[vif].bytes_in+=skb->len;
1321
1322         /*
1323          *      Forward the frame
1324          */
1325         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1326                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1327                         if (psend != -1) {
1328                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1329                                 if (skb2)
1330                                         ipmr_queue_xmit(skb2, cache, psend);
1331                         }
1332                         psend=ct;
1333                 }
1334         }
1335         if (psend != -1) {
1336                 if (local) {
1337                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1338                         if (skb2)
1339                                 ipmr_queue_xmit(skb2, cache, psend);
1340                 } else {
1341                         ipmr_queue_xmit(skb, cache, psend);
1342                         return 0;
1343                 }
1344         }
1345
1346 dont_forward:
1347         if (!local)
1348                 kfree_skb(skb);
1349         return 0;
1350 }
1351
1352
1353 /*
1354  *      Multicast packets for forwarding arrive here
1355  */
1356
1357 int ip_mr_input(struct sk_buff *skb)
1358 {
1359         struct mfc_cache *cache;
1360         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1361
1362         /* Packet is looped back after forward, it should not be
1363            forwarded second time, but still can be delivered locally.
1364          */
1365         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1366                 goto dont_forward;
1367
1368         if (!local) {
1369                     if (IPCB(skb)->opt.router_alert) {
1370                             if (ip_call_ra_chain(skb))
1371                                     return 0;
1372                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1373                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1374                                Cisco IOS <= 11.2(8)) do not put router alert
1375                                option to IGMP packets destined to routable
1376                                groups. It is very bad, because it means
1377                                that we can forward NO IGMP messages.
1378                              */
1379                             read_lock(&mrt_lock);
1380                             if (mroute_socket) {
1381                                     nf_reset(skb);
1382                                     raw_rcv(mroute_socket, skb);
1383                                     read_unlock(&mrt_lock);
1384                                     return 0;
1385                             }
1386                             read_unlock(&mrt_lock);
1387                     }
1388         }
1389
1390         read_lock(&mrt_lock);
1391         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1392
1393         /*
1394          *      No usable cache entry
1395          */
1396         if (cache==NULL) {
1397                 int vif;
1398
1399                 if (local) {
1400                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1401                         ip_local_deliver(skb);
1402                         if (skb2 == NULL) {
1403                                 read_unlock(&mrt_lock);
1404                                 return -ENOBUFS;
1405                         }
1406                         skb = skb2;
1407                 }
1408
1409                 vif = ipmr_find_vif(skb->dev);
1410                 if (vif >= 0) {
1411                         int err = ipmr_cache_unresolved(vif, skb);
1412                         read_unlock(&mrt_lock);
1413
1414                         return err;
1415                 }
1416                 read_unlock(&mrt_lock);
1417                 kfree_skb(skb);
1418                 return -ENODEV;
1419         }
1420
1421         ip_mr_forward(skb, cache, local);
1422
1423         read_unlock(&mrt_lock);
1424
1425         if (local)
1426                 return ip_local_deliver(skb);
1427
1428         return 0;
1429
1430 dont_forward:
1431         if (local)
1432                 return ip_local_deliver(skb);
1433         kfree_skb(skb);
1434         return 0;
1435 }
1436
#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 *
 * A valid PIMv1 register has its outer headers stripped and the inner
 * packet is re-injected through netif_rx() as if it had been received on
 * the register vif's device.  Anything else is dropped.  Always returns 0;
 * the skb is consumed in every case.
 */

int pim_rcv_v1(struct sk_buff * skb)
{
	struct igmphdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;
	struct net_device_stats *stats;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = igmp_hdr(skb);

	if (!mroute_do_pim)
		goto drop;
	if (skb->len < sizeof(*pim) + sizeof(*encap))
		goto drop;
	if (pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	encap = (struct iphdr *)(skb_transport_header(skb) +
				 sizeof(struct igmphdr));
	/*
	   Check that:
	   a. packet is really destined to a multicast group
	   b. packet is not a NULL-REGISTER
	   c. packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr))
		goto drop;
	if (encap->tot_len == 0)
		goto drop;
	if (ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	/* Take a reference on the register device while holding the lock. */
	read_lock(&mrt_lock);
	if (reg_vif_num >= 0) {
		reg_dev = vif_table[reg_vif_num].dev;
		if (reg_dev)
			dev_hold(reg_dev);
	}
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	/* Strip everything in front of the encapsulated IP header. */
	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;

	stats = netdev_priv(reg_dev);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;

	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif
1501
#ifdef CONFIG_IP_PIMSM_V2
/*
 * Receive handler for PIMv2 packets.
 *
 * Only non-NULL register messages with an acceptable checksum (over the
 * PIM header alone, or over the whole packet) are processed: the outer
 * headers are stripped and the inner multicast packet is re-injected
 * through netif_rx() on the register vif's device.  Everything else is
 * dropped.  Always returns 0; the skb is consumed in every case.
 */
static int pim_rcv(struct sk_buff * skb)
{
	struct pimreghdr *pim;
	struct iphdr   *encap;
	struct net_device  *reg_dev = NULL;
	struct net_device_stats *stats;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)))
		goto drop;
	if (pim->flags&PIM_NULL_REGISTER)
		goto drop;
	/* Drop only if both the header-only and full checksums fail. */
	if (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	    csum_fold(skb_checksum(skb, 0, skb->len, 0)))
		goto drop;

	/* check if the inner packet is destined to mcast group */
	encap = (struct iphdr *)(pim + 1);
	if (!ipv4_is_multicast(encap->daddr))
		goto drop;
	if (encap->tot_len == 0)
		goto drop;
	if (ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
		goto drop;

	/* Take a reference on the register device while holding the lock. */
	read_lock(&mrt_lock);
	if (reg_vif_num >= 0) {
		reg_dev = vif_table[reg_vif_num].dev;
		if (reg_dev)
			dev_hold(reg_dev);
	}
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		goto drop;

	/* Strip everything in front of the encapsulated IP header. */
	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8*)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);

	stats = netdev_priv(reg_dev);
	stats->rx_bytes += skb->len;
	stats->rx_packets++;

	skb->dst = NULL;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);
	return 0;
 drop:
	kfree_skb(skb);
	return 0;
}
#endif
1557
1558 static int
1559 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1560 {
1561         int ct;
1562         struct rtnexthop *nhp;
1563         struct net_device *dev = vif_table[c->mfc_parent].dev;
1564         u8 *b = skb_tail_pointer(skb);
1565         struct rtattr *mp_head;
1566
1567         if (dev)
1568                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1569
1570         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1571
1572         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1573                 if (c->mfc_un.res.ttls[ct] < 255) {
1574                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1575                                 goto rtattr_failure;
1576                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1577                         nhp->rtnh_flags = 0;
1578                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1579                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1580                         nhp->rtnh_len = sizeof(*nhp);
1581                 }
1582         }
1583         mp_head->rta_type = RTA_MULTIPATH;
1584         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1585         rtm->rtm_type = RTN_MULTICAST;
1586         return 1;
1587
1588 rtattr_failure:
1589         nlmsg_trim(skb, b);
1590         return -EMSGSIZE;
1591 }
1592
1593 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1594 {
1595         int err;
1596         struct mfc_cache *cache;
1597         struct rtable *rt = (struct rtable*)skb->dst;
1598
1599         read_lock(&mrt_lock);
1600         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1601
1602         if (cache==NULL) {
1603                 struct sk_buff *skb2;
1604                 struct iphdr *iph;
1605                 struct net_device *dev;
1606                 int vif;
1607
1608                 if (nowait) {
1609                         read_unlock(&mrt_lock);
1610                         return -EAGAIN;
1611                 }
1612
1613                 dev = skb->dev;
1614                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1615                         read_unlock(&mrt_lock);
1616                         return -ENODEV;
1617                 }
1618                 skb2 = skb_clone(skb, GFP_ATOMIC);
1619                 if (!skb2) {
1620                         read_unlock(&mrt_lock);
1621                         return -ENOMEM;
1622                 }
1623
1624                 skb_push(skb2, sizeof(struct iphdr));
1625                 skb_reset_network_header(skb2);
1626                 iph = ip_hdr(skb2);
1627                 iph->ihl = sizeof(struct iphdr) >> 2;
1628                 iph->saddr = rt->rt_src;
1629                 iph->daddr = rt->rt_dst;
1630                 iph->version = 0;
1631                 err = ipmr_cache_unresolved(vif, skb2);
1632                 read_unlock(&mrt_lock);
1633                 return err;
1634         }
1635
1636         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1637                 cache->mfc_flags |= MFC_NOTIFY;
1638         err = ipmr_fill_mroute(skb, cache, rtm);
1639         read_unlock(&mrt_lock);
1640         return err;
1641 }
1642
1643 #ifdef CONFIG_PROC_FS
1644 /*
1645  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1646  */
/* Private seq_file iterator state for the vif table walk. */
struct ipmr_vif_iter {
	int ct;		/* current index into vif_table */
};
1650
1651 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1652                                            loff_t pos)
1653 {
1654         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1655                 if (!VIF_EXISTS(iter->ct))
1656                         continue;
1657                 if (pos-- == 0)
1658                         return &vif_table[iter->ct];
1659         }
1660         return NULL;
1661 }
1662
1663 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1664         __acquires(mrt_lock)
1665 {
1666         read_lock(&mrt_lock);
1667         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1668                 : SEQ_START_TOKEN;
1669 }
1670
1671 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1672 {
1673         struct ipmr_vif_iter *iter = seq->private;
1674
1675         ++*pos;
1676         if (v == SEQ_START_TOKEN)
1677                 return ipmr_vif_seq_idx(iter, 0);
1678
1679         while (++iter->ct < maxvif) {
1680                 if (!VIF_EXISTS(iter->ct))
1681                         continue;
1682                 return &vif_table[iter->ct];
1683         }
1684         return NULL;
1685 }
1686
1687 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1688         __releases(mrt_lock)
1689 {
1690         read_unlock(&mrt_lock);
1691 }
1692
1693 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1694 {
1695         if (v == SEQ_START_TOKEN) {
1696                 seq_puts(seq,
1697                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1698         } else {
1699                 const struct vif_device *vif = v;
1700                 const char *name =  vif->dev ? vif->dev->name : "none";
1701
1702                 seq_printf(seq,
1703                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1704                            vif - vif_table,
1705                            name, vif->bytes_in, vif->pkt_in,
1706                            vif->bytes_out, vif->pkt_out,
1707                            vif->flags, vif->local, vif->remote);
1708         }
1709         return 0;
1710 }
1711
1712 static const struct seq_operations ipmr_vif_seq_ops = {
1713         .start = ipmr_vif_seq_start,
1714         .next  = ipmr_vif_seq_next,
1715         .stop  = ipmr_vif_seq_stop,
1716         .show  = ipmr_vif_seq_show,
1717 };
1718
1719 static int ipmr_vif_open(struct inode *inode, struct file *file)
1720 {
1721         return seq_open_private(file, &ipmr_vif_seq_ops,
1722                         sizeof(struct ipmr_vif_iter));
1723 }
1724
1725 static const struct file_operations ipmr_vif_fops = {
1726         .owner   = THIS_MODULE,
1727         .open    = ipmr_vif_open,
1728         .read    = seq_read,
1729         .llseek  = seq_lseek,
1730         .release = seq_release_private,
1731 };
1732
/*
 * Private seq_file iterator state for the MFC cache walk.
 * cache records which list is being walked (mfc_cache_array,
 * &mfc_unres_queue, or NULL when none), which also tells the stop
 * callback which lock is held.
 */
struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;		/* current line in mfc_cache_array */
};
1737
1738
1739 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1740 {
1741         struct mfc_cache *mfc;
1742
1743         it->cache = mfc_cache_array;
1744         read_lock(&mrt_lock);
1745         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1746                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1747                         if (pos-- == 0)
1748                                 return mfc;
1749         read_unlock(&mrt_lock);
1750
1751         it->cache = &mfc_unres_queue;
1752         spin_lock_bh(&mfc_unres_lock);
1753         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1754                 if (pos-- == 0)
1755                         return mfc;
1756         spin_unlock_bh(&mfc_unres_lock);
1757
1758         it->cache = NULL;
1759         return NULL;
1760 }
1761
1762
1763 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1764 {
1765         struct ipmr_mfc_iter *it = seq->private;
1766         it->cache = NULL;
1767         it->ct = 0;
1768         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1769                 : SEQ_START_TOKEN;
1770 }
1771
1772 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1773 {
1774         struct mfc_cache *mfc = v;
1775         struct ipmr_mfc_iter *it = seq->private;
1776
1777         ++*pos;
1778
1779         if (v == SEQ_START_TOKEN)
1780                 return ipmr_mfc_seq_idx(seq->private, 0);
1781
1782         if (mfc->next)
1783                 return mfc->next;
1784
1785         if (it->cache == &mfc_unres_queue)
1786                 goto end_of_list;
1787
1788         BUG_ON(it->cache != mfc_cache_array);
1789
1790         while (++it->ct < MFC_LINES) {
1791                 mfc = mfc_cache_array[it->ct];
1792                 if (mfc)
1793                         return mfc;
1794         }
1795
1796         /* exhausted cache_array, show unresolved */
1797         read_unlock(&mrt_lock);
1798         it->cache = &mfc_unres_queue;
1799         it->ct = 0;
1800
1801         spin_lock_bh(&mfc_unres_lock);
1802         mfc = mfc_unres_queue;
1803         if (mfc)
1804                 return mfc;
1805
1806  end_of_list:
1807         spin_unlock_bh(&mfc_unres_lock);
1808         it->cache = NULL;
1809
1810         return NULL;
1811 }
1812
1813 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1814 {
1815         struct ipmr_mfc_iter *it = seq->private;
1816
1817         if (it->cache == &mfc_unres_queue)
1818                 spin_unlock_bh(&mfc_unres_lock);
1819         else if (it->cache == mfc_cache_array)
1820                 read_unlock(&mrt_lock);
1821 }
1822
1823 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1824 {
1825         int n;
1826
1827         if (v == SEQ_START_TOKEN) {
1828                 seq_puts(seq,
1829                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1830         } else {
1831                 const struct mfc_cache *mfc = v;
1832                 const struct ipmr_mfc_iter *it = seq->private;
1833
1834                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1835                            (unsigned long) mfc->mfc_mcastgrp,
1836                            (unsigned long) mfc->mfc_origin,
1837                            mfc->mfc_parent,
1838                            mfc->mfc_un.res.pkt,
1839                            mfc->mfc_un.res.bytes,
1840                            mfc->mfc_un.res.wrong_if);
1841
1842                 if (it->cache != &mfc_unres_queue) {
1843                         for (n = mfc->mfc_un.res.minvif;
1844                              n < mfc->mfc_un.res.maxvif; n++ ) {
1845                                 if (VIF_EXISTS(n)
1846                                    && mfc->mfc_un.res.ttls[n] < 255)
1847                                 seq_printf(seq,
1848                                            " %2d:%-3d",
1849                                            n, mfc->mfc_un.res.ttls[n]);
1850                         }
1851                 }
1852                 seq_putc(seq, '\n');
1853         }
1854         return 0;
1855 }
1856
1857 static const struct seq_operations ipmr_mfc_seq_ops = {
1858         .start = ipmr_mfc_seq_start,
1859         .next  = ipmr_mfc_seq_next,
1860         .stop  = ipmr_mfc_seq_stop,
1861         .show  = ipmr_mfc_seq_show,
1862 };
1863
1864 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1865 {
1866         return seq_open_private(file, &ipmr_mfc_seq_ops,
1867                         sizeof(struct ipmr_mfc_iter));
1868 }
1869
1870 static const struct file_operations ipmr_mfc_fops = {
1871         .owner   = THIS_MODULE,
1872         .open    = ipmr_mfc_open,
1873         .read    = seq_read,
1874         .llseek  = seq_lseek,
1875         .release = seq_release_private,
1876 };
1877 #endif
1878
1879 #ifdef CONFIG_IP_PIMSM_V2
1880 static struct net_protocol pim_protocol = {
1881         .handler        =       pim_rcv,
1882 };
1883 #endif
1884
1885
1886 /*
1887  *      Setup for IP multicast routing
1888  */
1889
1890 void __init ip_mr_init(void)
1891 {
1892         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1893                                        sizeof(struct mfc_cache),
1894                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1895                                        NULL);
1896         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1897         register_netdevice_notifier(&ip_mr_notifier);
1898 #ifdef CONFIG_PROC_FS
1899         proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
1900         proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
1901 #endif
1902 }