mm: Remove slab destructors from kmem_cache_create().
[linux-2.6] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65 #include <net/netlink.h>
66
67 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
68 #define CONFIG_IP_PIMSM 1
69 #endif
70
71 static struct sock *mroute_socket;
72
73
74 /* Big lock, protecting vif table, mrt cache and mroute socket state.
75    Note that the changes are semaphored via rtnl_lock.
76  */
77
78 static DEFINE_RWLOCK(mrt_lock);
79
80 /*
81  *      Multicast router control variables
82  */
83
84 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
85 static int maxvif;
86
87 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
88
89 static int mroute_do_assert;                            /* Set in PIM assert    */
90 static int mroute_do_pim;
91
92 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
93
94 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
95 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
96
97 /* Special spinlock for queue of unresolved entries */
98 static DEFINE_SPINLOCK(mfc_unres_lock);
99
100 /* We have returned to Alan's original scheme. The hash table of
101    resolved entries is changed only in process context and is protected
102    by the weak (reader/writer) lock mrt_lock. The queue of unresolved
103    entries is protected by the strong spinlock mfc_unres_lock.
104
105    With this scheme the data path takes no exclusive locks at all.
106  */
107
108 static struct kmem_cache *mrt_cachep __read_mostly;
109
110 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
111 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
112 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
113
114 #ifdef CONFIG_IP_PIMSM_V2
115 static struct net_protocol pim_protocol;
116 #endif
117
118 static struct timer_list ipmr_expire_timer;
119
120 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
121
122 static
123 struct net_device *ipmr_new_tunnel(struct vifctl *v)
124 {
125         struct net_device  *dev;
126
127         dev = __dev_get_by_name("tunl0");
128
129         if (dev) {
130                 int err;
131                 struct ifreq ifr;
132                 mm_segment_t    oldfs;
133                 struct ip_tunnel_parm p;
134                 struct in_device  *in_dev;
135
136                 memset(&p, 0, sizeof(p));
137                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
138                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
139                 p.iph.version = 4;
140                 p.iph.ihl = 5;
141                 p.iph.protocol = IPPROTO_IPIP;
142                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
143                 ifr.ifr_ifru.ifru_data = (void*)&p;
144
145                 oldfs = get_fs(); set_fs(KERNEL_DS);
146                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
147                 set_fs(oldfs);
148
149                 dev = NULL;
150
151                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
152                         dev->flags |= IFF_MULTICAST;
153
154                         in_dev = __in_dev_get_rtnl(dev);
155                         if (in_dev == NULL)
156                                 goto failure;
157
158                         ipv4_devconf_setall(in_dev);
159                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
160
161                         if (dev_open(dev))
162                                 goto failure;
163                 }
164         }
165         return dev;
166
167 failure:
168         /* allow the register to be completed before unregistering. */
169         rtnl_unlock();
170         rtnl_lock();
171
172         unregister_netdevice(dev);
173         return NULL;
174 }
175
176 #ifdef CONFIG_IP_PIMSM
177
178 static int reg_vif_num = -1;
179
180 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
181 {
182         read_lock(&mrt_lock);
183         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
184         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
185         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
186         read_unlock(&mrt_lock);
187         kfree_skb(skb);
188         return 0;
189 }
190
/* The register vif keeps its statistics in the device's private area. */
static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = (struct net_device_stats *)netdev_priv(dev);

	return stats;
}
195
196 static void reg_vif_setup(struct net_device *dev)
197 {
198         dev->type               = ARPHRD_PIMREG;
199         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
200         dev->flags              = IFF_NOARP;
201         dev->hard_start_xmit    = reg_vif_xmit;
202         dev->get_stats          = reg_vif_get_stats;
203         dev->destructor         = free_netdev;
204 }
205
206 static struct net_device *ipmr_reg_vif(void)
207 {
208         struct net_device *dev;
209         struct in_device *in_dev;
210
211         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
212                            reg_vif_setup);
213
214         if (dev == NULL)
215                 return NULL;
216
217         if (register_netdevice(dev)) {
218                 free_netdev(dev);
219                 return NULL;
220         }
221         dev->iflink = 0;
222
223         rcu_read_lock();
224         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
225                 rcu_read_unlock();
226                 goto failure;
227         }
228
229         ipv4_devconf_setall(in_dev);
230         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
231         rcu_read_unlock();
232
233         if (dev_open(dev))
234                 goto failure;
235
236         return dev;
237
238 failure:
239         /* allow the register to be completed before unregistering. */
240         rtnl_unlock();
241         rtnl_lock();
242
243         unregister_netdevice(dev);
244         return NULL;
245 }
246 #endif
247
248 /*
249  *      Delete a VIF entry
250  */
251
252 static int vif_delete(int vifi)
253 {
254         struct vif_device *v;
255         struct net_device *dev;
256         struct in_device *in_dev;
257
258         if (vifi < 0 || vifi >= maxvif)
259                 return -EADDRNOTAVAIL;
260
261         v = &vif_table[vifi];
262
263         write_lock_bh(&mrt_lock);
264         dev = v->dev;
265         v->dev = NULL;
266
267         if (!dev) {
268                 write_unlock_bh(&mrt_lock);
269                 return -EADDRNOTAVAIL;
270         }
271
272 #ifdef CONFIG_IP_PIMSM
273         if (vifi == reg_vif_num)
274                 reg_vif_num = -1;
275 #endif
276
277         if (vifi+1 == maxvif) {
278                 int tmp;
279                 for (tmp=vifi-1; tmp>=0; tmp--) {
280                         if (VIF_EXISTS(tmp))
281                                 break;
282                 }
283                 maxvif = tmp+1;
284         }
285
286         write_unlock_bh(&mrt_lock);
287
288         dev_set_allmulti(dev, -1);
289
290         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
291                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
292                 ip_rt_multicast_event(in_dev);
293         }
294
295         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
296                 unregister_netdevice(dev);
297
298         dev_put(dev);
299         return 0;
300 }
301
302 /* Destroy an unresolved cache entry, killing queued skbs
303    and reporting error to netlink readers.
304  */
305
306 static void ipmr_destroy_unres(struct mfc_cache *c)
307 {
308         struct sk_buff *skb;
309         struct nlmsgerr *e;
310
311         atomic_dec(&cache_resolve_queue_len);
312
313         while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
314                 if (ip_hdr(skb)->version == 0) {
315                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
316                         nlh->nlmsg_type = NLMSG_ERROR;
317                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
318                         skb_trim(skb, nlh->nlmsg_len);
319                         e = NLMSG_DATA(nlh);
320                         e->error = -ETIMEDOUT;
321                         memset(&e->msg, 0, sizeof(e->msg));
322
323                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
324                 } else
325                         kfree_skb(skb);
326         }
327
328         kmem_cache_free(mrt_cachep, c);
329 }
330
331
332 /* Single timer process for all the unresolved queue. */
333
334 static void ipmr_expire_process(unsigned long dummy)
335 {
336         unsigned long now;
337         unsigned long expires;
338         struct mfc_cache *c, **cp;
339
340         if (!spin_trylock(&mfc_unres_lock)) {
341                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
342                 return;
343         }
344
345         if (atomic_read(&cache_resolve_queue_len) == 0)
346                 goto out;
347
348         now = jiffies;
349         expires = 10*HZ;
350         cp = &mfc_unres_queue;
351
352         while ((c=*cp) != NULL) {
353                 if (time_after(c->mfc_un.unres.expires, now)) {
354                         unsigned long interval = c->mfc_un.unres.expires - now;
355                         if (interval < expires)
356                                 expires = interval;
357                         cp = &c->next;
358                         continue;
359                 }
360
361                 *cp = c->next;
362
363                 ipmr_destroy_unres(c);
364         }
365
366         if (atomic_read(&cache_resolve_queue_len))
367                 mod_timer(&ipmr_expire_timer, jiffies + expires);
368
369 out:
370         spin_unlock(&mfc_unres_lock);
371 }
372
373 /* Fill oifs list. It is called under write locked mrt_lock. */
374
375 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
376 {
377         int vifi;
378
379         cache->mfc_un.res.minvif = MAXVIFS;
380         cache->mfc_un.res.maxvif = 0;
381         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
382
383         for (vifi=0; vifi<maxvif; vifi++) {
384                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
385                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
386                         if (cache->mfc_un.res.minvif > vifi)
387                                 cache->mfc_un.res.minvif = vifi;
388                         if (cache->mfc_un.res.maxvif <= vifi)
389                                 cache->mfc_un.res.maxvif = vifi + 1;
390                 }
391         }
392 }
393
394 static int vif_add(struct vifctl *vifc, int mrtsock)
395 {
396         int vifi = vifc->vifc_vifi;
397         struct vif_device *v = &vif_table[vifi];
398         struct net_device *dev;
399         struct in_device *in_dev;
400
401         /* Is vif busy ? */
402         if (VIF_EXISTS(vifi))
403                 return -EADDRINUSE;
404
405         switch (vifc->vifc_flags) {
406 #ifdef CONFIG_IP_PIMSM
407         case VIFF_REGISTER:
408                 /*
409                  * Special Purpose VIF in PIM
410                  * All the packets will be sent to the daemon
411                  */
412                 if (reg_vif_num >= 0)
413                         return -EADDRINUSE;
414                 dev = ipmr_reg_vif();
415                 if (!dev)
416                         return -ENOBUFS;
417                 break;
418 #endif
419         case VIFF_TUNNEL:
420                 dev = ipmr_new_tunnel(vifc);
421                 if (!dev)
422                         return -ENOBUFS;
423                 break;
424         case 0:
425                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
426                 if (!dev)
427                         return -EADDRNOTAVAIL;
428                 dev_put(dev);
429                 break;
430         default:
431                 return -EINVAL;
432         }
433
434         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
435                 return -EADDRNOTAVAIL;
436         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
437         dev_set_allmulti(dev, +1);
438         ip_rt_multicast_event(in_dev);
439
440         /*
441          *      Fill in the VIF structures
442          */
443         v->rate_limit=vifc->vifc_rate_limit;
444         v->local=vifc->vifc_lcl_addr.s_addr;
445         v->remote=vifc->vifc_rmt_addr.s_addr;
446         v->flags=vifc->vifc_flags;
447         if (!mrtsock)
448                 v->flags |= VIFF_STATIC;
449         v->threshold=vifc->vifc_threshold;
450         v->bytes_in = 0;
451         v->bytes_out = 0;
452         v->pkt_in = 0;
453         v->pkt_out = 0;
454         v->link = dev->ifindex;
455         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
456                 v->link = dev->iflink;
457
458         /* And finish update writing critical data */
459         write_lock_bh(&mrt_lock);
460         dev_hold(dev);
461         v->dev=dev;
462 #ifdef CONFIG_IP_PIMSM
463         if (v->flags&VIFF_REGISTER)
464                 reg_vif_num = vifi;
465 #endif
466         if (vifi+1 > maxvif)
467                 maxvif = vifi+1;
468         write_unlock_bh(&mrt_lock);
469         return 0;
470 }
471
472 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
473 {
474         int line=MFC_HASH(mcastgrp,origin);
475         struct mfc_cache *c;
476
477         for (c=mfc_cache_array[line]; c; c = c->next) {
478                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
479                         break;
480         }
481         return c;
482 }
483
484 /*
485  *      Allocate a multicast cache entry
486  */
487 static struct mfc_cache *ipmr_cache_alloc(void)
488 {
489         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
490         if (c==NULL)
491                 return NULL;
492         c->mfc_un.res.minvif = MAXVIFS;
493         return c;
494 }
495
496 static struct mfc_cache *ipmr_cache_alloc_unres(void)
497 {
498         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
499         if (c==NULL)
500                 return NULL;
501         skb_queue_head_init(&c->mfc_un.unres.unresolved);
502         c->mfc_un.unres.expires = jiffies + 10*HZ;
503         return c;
504 }
505
506 /*
507  *      A cache entry has gone into a resolved state from queued
508  */
509
510 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
511 {
512         struct sk_buff *skb;
513         struct nlmsgerr *e;
514
515         /*
516          *      Play the pending entries through our router
517          */
518
519         while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
520                 if (ip_hdr(skb)->version == 0) {
521                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
522
523                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
524                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
525                                                   (u8 *)nlh);
526                         } else {
527                                 nlh->nlmsg_type = NLMSG_ERROR;
528                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
529                                 skb_trim(skb, nlh->nlmsg_len);
530                                 e = NLMSG_DATA(nlh);
531                                 e->error = -EMSGSIZE;
532                                 memset(&e->msg, 0, sizeof(e->msg));
533                         }
534
535                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
536                 } else
537                         ip_mr_forward(skb, c, 0);
538         }
539 }
540
541 /*
542  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
543  *      expects the following bizarre scheme.
544  *
545  *      Called under mrt_lock.
546  */
547
548 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
549 {
550         struct sk_buff *skb;
551         const int ihl = ip_hdrlen(pkt);
552         struct igmphdr *igmp;
553         struct igmpmsg *msg;
554         int ret;
555
556 #ifdef CONFIG_IP_PIMSM
557         if (assert == IGMPMSG_WHOLEPKT)
558                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
559         else
560 #endif
561                 skb = alloc_skb(128, GFP_ATOMIC);
562
563         if (!skb)
564                 return -ENOBUFS;
565
566 #ifdef CONFIG_IP_PIMSM
567         if (assert == IGMPMSG_WHOLEPKT) {
568                 /* Ugly, but we have no choice with this interface.
569                    Duplicate old header, fix ihl, length etc.
570                    And all this only to mangle msg->im_msgtype and
571                    to set msg->im_mbz to "mbz" :-)
572                  */
573                 skb_push(skb, sizeof(struct iphdr));
574                 skb_reset_network_header(skb);
575                 skb_reset_transport_header(skb);
576                 msg = (struct igmpmsg *)skb_network_header(skb);
577                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
578                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
579                 msg->im_mbz = 0;
580                 msg->im_vif = reg_vif_num;
581                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
582                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
583                                              sizeof(struct iphdr));
584         } else
585 #endif
586         {
587
588         /*
589          *      Copy the IP header
590          */
591
592         skb->network_header = skb->tail;
593         skb_put(skb, ihl);
594         skb_copy_to_linear_data(skb, pkt->data, ihl);
595         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
596         msg = (struct igmpmsg *)skb_network_header(skb);
597         msg->im_vif = vifi;
598         skb->dst = dst_clone(pkt->dst);
599
600         /*
601          *      Add our header
602          */
603
604         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
605         igmp->type      =
606         msg->im_msgtype = assert;
607         igmp->code      =       0;
608         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
609         skb->transport_header = skb->network_header;
610         }
611
612         if (mroute_socket == NULL) {
613                 kfree_skb(skb);
614                 return -EINVAL;
615         }
616
617         /*
618          *      Deliver to mrouted
619          */
620         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
621                 if (net_ratelimit())
622                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
623                 kfree_skb(skb);
624         }
625
626         return ret;
627 }
628
629 /*
630  *      Queue a packet for resolution. It gets locked cache entry!
631  */
632
633 static int
634 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
635 {
636         int err;
637         struct mfc_cache *c;
638         const struct iphdr *iph = ip_hdr(skb);
639
640         spin_lock_bh(&mfc_unres_lock);
641         for (c=mfc_unres_queue; c; c=c->next) {
642                 if (c->mfc_mcastgrp == iph->daddr &&
643                     c->mfc_origin == iph->saddr)
644                         break;
645         }
646
647         if (c == NULL) {
648                 /*
649                  *      Create a new entry if allowable
650                  */
651
652                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
653                     (c=ipmr_cache_alloc_unres())==NULL) {
654                         spin_unlock_bh(&mfc_unres_lock);
655
656                         kfree_skb(skb);
657                         return -ENOBUFS;
658                 }
659
660                 /*
661                  *      Fill in the new cache entry
662                  */
663                 c->mfc_parent   = -1;
664                 c->mfc_origin   = iph->saddr;
665                 c->mfc_mcastgrp = iph->daddr;
666
667                 /*
668                  *      Reflect first query at mrouted.
669                  */
670                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
671                         /* If the report failed throw the cache entry
672                            out - Brad Parker
673                          */
674                         spin_unlock_bh(&mfc_unres_lock);
675
676                         kmem_cache_free(mrt_cachep, c);
677                         kfree_skb(skb);
678                         return err;
679                 }
680
681                 atomic_inc(&cache_resolve_queue_len);
682                 c->next = mfc_unres_queue;
683                 mfc_unres_queue = c;
684
685                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
686         }
687
688         /*
689          *      See if we can append the packet
690          */
691         if (c->mfc_un.unres.unresolved.qlen>3) {
692                 kfree_skb(skb);
693                 err = -ENOBUFS;
694         } else {
695                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
696                 err = 0;
697         }
698
699         spin_unlock_bh(&mfc_unres_lock);
700         return err;
701 }
702
703 /*
704  *      MFC cache manipulation by user space mroute daemon
705  */
706
707 static int ipmr_mfc_delete(struct mfcctl *mfc)
708 {
709         int line;
710         struct mfc_cache *c, **cp;
711
712         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
713
714         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
715                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
716                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
717                         write_lock_bh(&mrt_lock);
718                         *cp = c->next;
719                         write_unlock_bh(&mrt_lock);
720
721                         kmem_cache_free(mrt_cachep, c);
722                         return 0;
723                 }
724         }
725         return -ENOENT;
726 }
727
728 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
729 {
730         int line;
731         struct mfc_cache *uc, *c, **cp;
732
733         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
734
735         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
736                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
737                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
738                         break;
739         }
740
741         if (c != NULL) {
742                 write_lock_bh(&mrt_lock);
743                 c->mfc_parent = mfc->mfcc_parent;
744                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
745                 if (!mrtsock)
746                         c->mfc_flags |= MFC_STATIC;
747                 write_unlock_bh(&mrt_lock);
748                 return 0;
749         }
750
751         if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
752                 return -EINVAL;
753
754         c=ipmr_cache_alloc();
755         if (c==NULL)
756                 return -ENOMEM;
757
758         c->mfc_origin=mfc->mfcc_origin.s_addr;
759         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
760         c->mfc_parent=mfc->mfcc_parent;
761         ipmr_update_thresholds(c, mfc->mfcc_ttls);
762         if (!mrtsock)
763                 c->mfc_flags |= MFC_STATIC;
764
765         write_lock_bh(&mrt_lock);
766         c->next = mfc_cache_array[line];
767         mfc_cache_array[line] = c;
768         write_unlock_bh(&mrt_lock);
769
770         /*
771          *      Check to see if we resolved a queued list. If so we
772          *      need to send on the frames and tidy up.
773          */
774         spin_lock_bh(&mfc_unres_lock);
775         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
776              cp = &uc->next) {
777                 if (uc->mfc_origin == c->mfc_origin &&
778                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
779                         *cp = uc->next;
780                         if (atomic_dec_and_test(&cache_resolve_queue_len))
781                                 del_timer(&ipmr_expire_timer);
782                         break;
783                 }
784         }
785         spin_unlock_bh(&mfc_unres_lock);
786
787         if (uc) {
788                 ipmr_cache_resolve(uc, c);
789                 kmem_cache_free(mrt_cachep, uc);
790         }
791         return 0;
792 }
793
794 /*
795  *      Close the multicast socket, and clear the vif tables etc
796  */
797
798 static void mroute_clean_tables(struct sock *sk)
799 {
800         int i;
801
802         /*
803          *      Shut down all active vif entries
804          */
805         for (i=0; i<maxvif; i++) {
806                 if (!(vif_table[i].flags&VIFF_STATIC))
807                         vif_delete(i);
808         }
809
810         /*
811          *      Wipe the cache
812          */
813         for (i=0;i<MFC_LINES;i++) {
814                 struct mfc_cache *c, **cp;
815
816                 cp = &mfc_cache_array[i];
817                 while ((c = *cp) != NULL) {
818                         if (c->mfc_flags&MFC_STATIC) {
819                                 cp = &c->next;
820                                 continue;
821                         }
822                         write_lock_bh(&mrt_lock);
823                         *cp = c->next;
824                         write_unlock_bh(&mrt_lock);
825
826                         kmem_cache_free(mrt_cachep, c);
827                 }
828         }
829
830         if (atomic_read(&cache_resolve_queue_len) != 0) {
831                 struct mfc_cache *c;
832
833                 spin_lock_bh(&mfc_unres_lock);
834                 while (mfc_unres_queue != NULL) {
835                         c = mfc_unres_queue;
836                         mfc_unres_queue = c->next;
837                         spin_unlock_bh(&mfc_unres_lock);
838
839                         ipmr_destroy_unres(c);
840
841                         spin_lock_bh(&mfc_unres_lock);
842                 }
843                 spin_unlock_bh(&mfc_unres_lock);
844         }
845 }
846
847 static void mrtsock_destruct(struct sock *sk)
848 {
849         rtnl_lock();
850         if (sk == mroute_socket) {
851                 IPV4_DEVCONF_ALL(MC_FORWARDING)--;
852
853                 write_lock_bh(&mrt_lock);
854                 mroute_socket=NULL;
855                 write_unlock_bh(&mrt_lock);
856
857                 mroute_clean_tables(sk);
858         }
859         rtnl_unlock();
860 }
861
862 /*
863  *      Socket options and virtual interface manipulation. The whole
864  *      virtual interface system is a complete heap, but unfortunately
865  *      that's how BSD mrouted happens to think. Maybe one day with a proper
866  *      MOSPF/PIM router set up we can clean this up.
867  */
868
869 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
870 {
871         int ret;
872         struct vifctl vif;
873         struct mfcctl mfc;
874
875         if (optname != MRT_INIT) {
876                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
877                         return -EACCES;
878         }
879
880         switch (optname) {
881         case MRT_INIT:
882                 if (sk->sk_type != SOCK_RAW ||
883                     inet_sk(sk)->num != IPPROTO_IGMP)
884                         return -EOPNOTSUPP;
885                 if (optlen!=sizeof(int))
886                         return -ENOPROTOOPT;
887
888                 rtnl_lock();
889                 if (mroute_socket) {
890                         rtnl_unlock();
891                         return -EADDRINUSE;
892                 }
893
894                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
895                 if (ret == 0) {
896                         write_lock_bh(&mrt_lock);
897                         mroute_socket=sk;
898                         write_unlock_bh(&mrt_lock);
899
900                         IPV4_DEVCONF_ALL(MC_FORWARDING)++;
901                 }
902                 rtnl_unlock();
903                 return ret;
904         case MRT_DONE:
905                 if (sk!=mroute_socket)
906                         return -EACCES;
907                 return ip_ra_control(sk, 0, NULL);
908         case MRT_ADD_VIF:
909         case MRT_DEL_VIF:
910                 if (optlen!=sizeof(vif))
911                         return -EINVAL;
912                 if (copy_from_user(&vif,optval,sizeof(vif)))
913                         return -EFAULT;
914                 if (vif.vifc_vifi >= MAXVIFS)
915                         return -ENFILE;
916                 rtnl_lock();
917                 if (optname==MRT_ADD_VIF) {
918                         ret = vif_add(&vif, sk==mroute_socket);
919                 } else {
920                         ret = vif_delete(vif.vifc_vifi);
921                 }
922                 rtnl_unlock();
923                 return ret;
924
925                 /*
926                  *      Manipulate the forwarding caches. These live
927                  *      in a sort of kernel/user symbiosis.
928                  */
929         case MRT_ADD_MFC:
930         case MRT_DEL_MFC:
931                 if (optlen!=sizeof(mfc))
932                         return -EINVAL;
933                 if (copy_from_user(&mfc,optval, sizeof(mfc)))
934                         return -EFAULT;
935                 rtnl_lock();
936                 if (optname==MRT_DEL_MFC)
937                         ret = ipmr_mfc_delete(&mfc);
938                 else
939                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
940                 rtnl_unlock();
941                 return ret;
942                 /*
943                  *      Control PIM assert.
944                  */
945         case MRT_ASSERT:
946         {
947                 int v;
948                 if (get_user(v,(int __user *)optval))
949                         return -EFAULT;
950                 mroute_do_assert=(v)?1:0;
951                 return 0;
952         }
953 #ifdef CONFIG_IP_PIMSM
954         case MRT_PIM:
955         {
956                 int v, ret;
957                 if (get_user(v,(int __user *)optval))
958                         return -EFAULT;
959                 v = (v)?1:0;
960                 rtnl_lock();
961                 ret = 0;
962                 if (v != mroute_do_pim) {
963                         mroute_do_pim = v;
964                         mroute_do_assert = v;
965 #ifdef CONFIG_IP_PIMSM_V2
966                         if (mroute_do_pim)
967                                 ret = inet_add_protocol(&pim_protocol,
968                                                         IPPROTO_PIM);
969                         else
970                                 ret = inet_del_protocol(&pim_protocol,
971                                                         IPPROTO_PIM);
972                         if (ret < 0)
973                                 ret = -EAGAIN;
974 #endif
975                 }
976                 rtnl_unlock();
977                 return ret;
978         }
979 #endif
980         /*
981          *      Spurious command, or MRT_VERSION which you cannot
982          *      set.
983          */
984         default:
985                 return -ENOPROTOOPT;
986         }
987 }
988
989 /*
990  *      Getsock opt support for the multicast routing system.
991  */
992
993 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
994 {
995         int olr;
996         int val;
997
998         if (optname!=MRT_VERSION &&
999 #ifdef CONFIG_IP_PIMSM
1000            optname!=MRT_PIM &&
1001 #endif
1002            optname!=MRT_ASSERT)
1003                 return -ENOPROTOOPT;
1004
1005         if (get_user(olr, optlen))
1006                 return -EFAULT;
1007
1008         olr = min_t(unsigned int, olr, sizeof(int));
1009         if (olr < 0)
1010                 return -EINVAL;
1011
1012         if (put_user(olr,optlen))
1013                 return -EFAULT;
1014         if (optname==MRT_VERSION)
1015                 val=0x0305;
1016 #ifdef CONFIG_IP_PIMSM
1017         else if (optname==MRT_PIM)
1018                 val=mroute_do_pim;
1019 #endif
1020         else
1021                 val=mroute_do_assert;
1022         if (copy_to_user(optval,&val,olr))
1023                 return -EFAULT;
1024         return 0;
1025 }
1026
1027 /*
1028  *      The IP multicast ioctl support routines.
1029  */
1030
1031 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1032 {
1033         struct sioc_sg_req sr;
1034         struct sioc_vif_req vr;
1035         struct vif_device *vif;
1036         struct mfc_cache *c;
1037
1038         switch (cmd) {
1039         case SIOCGETVIFCNT:
1040                 if (copy_from_user(&vr,arg,sizeof(vr)))
1041                         return -EFAULT;
1042                 if (vr.vifi>=maxvif)
1043                         return -EINVAL;
1044                 read_lock(&mrt_lock);
1045                 vif=&vif_table[vr.vifi];
1046                 if (VIF_EXISTS(vr.vifi))        {
1047                         vr.icount=vif->pkt_in;
1048                         vr.ocount=vif->pkt_out;
1049                         vr.ibytes=vif->bytes_in;
1050                         vr.obytes=vif->bytes_out;
1051                         read_unlock(&mrt_lock);
1052
1053                         if (copy_to_user(arg,&vr,sizeof(vr)))
1054                                 return -EFAULT;
1055                         return 0;
1056                 }
1057                 read_unlock(&mrt_lock);
1058                 return -EADDRNOTAVAIL;
1059         case SIOCGETSGCNT:
1060                 if (copy_from_user(&sr,arg,sizeof(sr)))
1061                         return -EFAULT;
1062
1063                 read_lock(&mrt_lock);
1064                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1065                 if (c) {
1066                         sr.pktcnt = c->mfc_un.res.pkt;
1067                         sr.bytecnt = c->mfc_un.res.bytes;
1068                         sr.wrong_if = c->mfc_un.res.wrong_if;
1069                         read_unlock(&mrt_lock);
1070
1071                         if (copy_to_user(arg,&sr,sizeof(sr)))
1072                                 return -EFAULT;
1073                         return 0;
1074                 }
1075                 read_unlock(&mrt_lock);
1076                 return -EADDRNOTAVAIL;
1077         default:
1078                 return -ENOIOCTLCMD;
1079         }
1080 }
1081
1082
1083 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1084 {
1085         struct vif_device *v;
1086         int ct;
1087         if (event != NETDEV_UNREGISTER)
1088                 return NOTIFY_DONE;
1089         v=&vif_table[0];
1090         for (ct=0;ct<maxvif;ct++,v++) {
1091                 if (v->dev==ptr)
1092                         vif_delete(ct);
1093         }
1094         return NOTIFY_DONE;
1095 }
1096
1097
/* Notifier block whose callback is ipmr_device_event(). */
1098 static struct notifier_block ip_mr_notifier={
1099         .notifier_call = ipmr_device_event,
1100 };
1101
1102 /*
1103  *      Encapsulate a packet by attaching a valid IPIP header to it.
1104  *      This avoids tunnel drivers and other mess and gives us the speed so
1105  *      important for multicast video.
1106  */
1107
1108 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1109 {
1110         struct iphdr *iph;
1111         struct iphdr *old_iph = ip_hdr(skb);
1112
1113         skb_push(skb, sizeof(struct iphdr));
1114         skb->transport_header = skb->network_header;
1115         skb_reset_network_header(skb);
1116         iph = ip_hdr(skb);
1117
1118         iph->version    =       4;
1119         iph->tos        =       old_iph->tos;
1120         iph->ttl        =       old_iph->ttl;
1121         iph->frag_off   =       0;
1122         iph->daddr      =       daddr;
1123         iph->saddr      =       saddr;
1124         iph->protocol   =       IPPROTO_IPIP;
1125         iph->ihl        =       5;
1126         iph->tot_len    =       htons(skb->len);
1127         ip_select_ident(iph, skb->dst, NULL);
1128         ip_send_check(iph);
1129
1130         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1131         nf_reset(skb);
1132 }
1133
1134 static inline int ipmr_forward_finish(struct sk_buff *skb)
1135 {
1136         struct ip_options * opt = &(IPCB(skb)->opt);
1137
1138         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1139
1140         if (unlikely(opt->optlen))
1141                 ip_forward_options(skb);
1142
1143         return dst_output(skb);
1144 }
1145
1146 /*
1147  *      Processing handlers for ipmr_forward
1148  */
1149
/*
 *      Send one packet out through the virtual interface vif_table[vifi].
 *
 *      skb is consumed on every path: it is either freed here or handed
 *      to NF_HOOK() with ipmr_forward_finish() as the continuation.
 *      The cache entry c is not referenced in this function body.
 */
1150 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1151 {
1152         const struct iphdr *iph = ip_hdr(skb);
1153         struct vif_device *vif = &vif_table[vifi];
1154         struct net_device *dev;
1155         struct rtable *rt;
1156         int    encap = 0;
1157
        /* A vif without a device cannot transmit. */
1158         if (vif->dev == NULL)
1159                 goto out_free;
1160
1161 #ifdef CONFIG_IP_PIMSM
        /* Register vif: account the packet, report it as IGMPMSG_WHOLEPKT
         * via ipmr_cache_report() and free it instead of transmitting. */
1162         if (vif->flags & VIFF_REGISTER) {
1163                 vif->pkt_out++;
1164                 vif->bytes_out+=skb->len;
1165                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1166                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1167                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1168                 kfree_skb(skb);
1169                 return;
1170         }
1171 #endif
1172
        /* Route lookup: a tunnel vif routes from vif->local to vif->remote
         * and needs room for one extra IP header; any other vif routes to
         * the packet's own destination address.  Both use vif->link as oif. */
1173         if (vif->flags&VIFF_TUNNEL) {
1174                 struct flowi fl = { .oif = vif->link,
1175                                     .nl_u = { .ip4_u =
1176                                               { .daddr = vif->remote,
1177                                                 .saddr = vif->local,
1178                                                 .tos = RT_TOS(iph->tos) } },
1179                                     .proto = IPPROTO_IPIP };
1180                 if (ip_route_output_key(&rt, &fl))
1181                         goto out_free;
1182                 encap = sizeof(struct iphdr);
1183         } else {
1184                 struct flowi fl = { .oif = vif->link,
1185                                     .nl_u = { .ip4_u =
1186                                               { .daddr = iph->daddr,
1187                                                 .tos = RT_TOS(iph->tos) } },
1188                                     .proto = IPPROTO_IPIP };
1189                 if (ip_route_output_key(&rt, &fl))
1190                         goto out_free;
1191         }
1192
1193         dev = rt->u.dst.dev;
1194
1195         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1196                 /* Do not fragment multicasts. Alas, IPv4 does not
1197                    allow to send ICMP, so that packets will disappear
1198                    to blackhole.
1199                  */
1200
1201                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1202                 ip_rt_put(rt);
1203                 goto out_free;
1204         }
1205
        /* Headroom requested from skb_cow(): optional outer IP header plus
         * link-layer reserve plus the route's header_len. */
1206         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1207
1208         if (skb_cow(skb, encap)) {
1209                 ip_rt_put(rt);
1210                 goto out_free;
1211         }
1212
1213         vif->pkt_out++;
1214         vif->bytes_out+=skb->len;
1215
        /* Replace the packet's route with the one just looked up; the
         * header is fetched again with ip_hdr() rather than through iph. */
1216         dst_release(skb->dst);
1217         skb->dst = &rt->u.dst;
1218         ip_decrease_ttl(ip_hdr(skb));
1219
1220         /* FIXME: forward and output firewalls used to be called here.
1221          * What do we do with netfilter? -- RR */
1222         if (vif->flags & VIFF_TUNNEL) {
1223                 ip_encap(skb, vif->local, vif->remote);
1224                 /* FIXME: extra output firewall step used to be here. --RR */
1225                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1226                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1227         }
1228
        /* ip_mr_input() tests this flag and will not forward the packet a
         * second time. */
1229         IPCB(skb)->flags |= IPSKB_FORWARDED;
1230
1231         /*
1232          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1233          * not only before forwarding, but after forwarding on all output
1234          * interfaces. It is clear, if mrouter runs a multicasting
1235          * program, it should receive packets not depending to what interface
1236          * program is joined.
1237          * If we will not make it, the program will have to join on all
1238          * interfaces. On the other hand, multihoming host (or router, but
1239          * not mrouter) cannot join to more than one interface - it will
1240          * result in receiving multiple packets.
1241          */
1242         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1243                 ipmr_forward_finish);
1244         return;
1245
1246 out_free:
1247         kfree_skb(skb);
1248         return;
1249 }
1250
1251 static int ipmr_find_vif(struct net_device *dev)
1252 {
1253         int ct;
1254         for (ct=maxvif-1; ct>=0; ct--) {
1255                 if (vif_table[ct].dev == dev)
1256                         break;
1257         }
1258         return ct;
1259 }
1260
1261 /* "local" means that we should preserve one skb (for local delivery) */
1262
/*
 *      Forward skb according to the resolved cache entry.
 *
 *      The entry's packet/byte counters are always updated.  If the packet
 *      did not arrive on the entry's parent vif it is not forwarded (and a
 *      IGMPMSG_WRONGVIF report may be sent); otherwise it is transmitted
 *      on every vif in [minvif, maxvif) whose ttl threshold is below the
 *      packet's ttl.
 *
 *      When local is zero skb is consumed (transmitted or freed); when it
 *      is non-zero only clones are transmitted and skb is left to the
 *      caller.  Always returns 0.
 */
1263 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1264 {
        /* psend: vif selected by the loop below but not yet sent to; -1 = none */
1265         int psend = -1;
1266         int vif, ct;
1267
1268         vif = cache->mfc_parent;
1269         cache->mfc_un.res.pkt++;
1270         cache->mfc_un.res.bytes += skb->len;
1271
1272         /*
1273          * Wrong interface: drop packet and (maybe) send PIM assert.
1274          */
1275         if (vif_table[vif].dev != skb->dev) {
1276                 int true_vifi;
1277
1278                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1279                         /* It is our own packet, looped back.
1280                            Very complicated situation...
1281
1282                            The best workaround until routing daemons will be
1283                            fixed is not to redistribute packet, if it was
1284                            send through wrong interface. It means, that
1285                            multicast applications WILL NOT work for
1286                            (S,G), which have default multicast route pointing
1287                            to wrong oif. In any case, it is not a good
1288                            idea to use multicasting applications on router.
1289                          */
1290                         goto dont_forward;
1291                 }
1292
1293                 cache->mfc_un.res.wrong_if++;
1294                 true_vifi = ipmr_find_vif(skb->dev);
1295
                /* Reports are rate limited: one is sent only after
                 * MFC_ASSERT_THRESH jiffies have passed since last_assert. */
1296                 if (true_vifi >= 0 && mroute_do_assert &&
1297                     /* pimsm uses asserts, when switching from RPT to SPT,
1298                        so that we cannot check that packet arrived on an oif.
1299                        It is bad, but otherwise we would need to move pretty
1300                        large chunk of pimd to kernel. Ough... --ANK
1301                      */
1302                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1303                     time_after(jiffies,
1304                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1305                         cache->mfc_un.res.last_assert = jiffies;
1306                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1307                 }
1308                 goto dont_forward;
1309         }
1310
1311         vif_table[vif].pkt_in++;
1312         vif_table[vif].bytes_in+=skb->len;
1313
1314         /*
1315          *      Forward the frame
1316          */
        /* Transmission runs one step behind selection: each newly matching
         * vif triggers a cloned send to the previously selected one, so the
         * last selected vif is still pending in psend after the loop. */
1317         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1318                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1319                         if (psend != -1) {
1320                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1321                                 if (skb2)
1322                                         ipmr_queue_xmit(skb2, cache, psend);
1323                         }
1324                         psend=ct;
1325                 }
1326         }
        /* Pending vif: send a clone if the caller keeps skb (local),
         * otherwise send skb itself, which ipmr_queue_xmit() consumes. */
1327         if (psend != -1) {
1328                 if (local) {
1329                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1330                         if (skb2)
1331                                 ipmr_queue_xmit(skb2, cache, psend);
1332                 } else {
1333                         ipmr_queue_xmit(skb, cache, psend);
1334                         return 0;
1335                 }
1336         }
1337
1338 dont_forward:
1339         if (!local)
1340                 kfree_skb(skb);
1341         return 0;
1342 }
1343
1344
1345 /*
1346  *      Multicast packets for forwarding arrive here
1347  */
1348
/*
 *      Input handler for multicast packets that may need forwarding.
 *
 *      local is non-zero when the packet's route carries RTCF_LOCAL; such
 *      packets are additionally passed to ip_local_deliver().  Packets
 *      already marked IPSKB_FORWARDED are never forwarded again.
 *
 *      Returns 0, the result of ip_local_deliver() or of
 *      ipmr_cache_unresolved(), -ENOBUFS if cloning fails on a cache miss,
 *      or -ENODEV if a cache miss arrives on a device that has no vif.
 */
1349 int ip_mr_input(struct sk_buff *skb)
1350 {
1351         struct mfc_cache *cache;
1352         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1353
1354         /* Packet is looped back after forward, it should not be
1355            forwarded second time, but still can be delivered locally.
1356          */
1357         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1358                 goto dont_forward;
1359
1360         if (!local) {
1361                     if (IPCB(skb)->opt.router_alert) {
1362                             if (ip_call_ra_chain(skb))
1363                                     return 0;
1364                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1365                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1366                                Cisco IOS <= 11.2(8)) do not put router alert
1367                                option to IGMP packets destined to routable
1368                                groups. It is very bad, because it means
1369                                that we can forward NO IGMP messages.
1370                              */
                            /* mroute_socket is read under mrt_lock; the packet is
                             * handed to it with raw_rcv() and not processed further. */
1371                             read_lock(&mrt_lock);
1372                             if (mroute_socket) {
1373                                     nf_reset(skb);
1374                                     raw_rcv(mroute_socket, skb);
1375                                     read_unlock(&mrt_lock);
1376                                     return 0;
1377                             }
1378                             read_unlock(&mrt_lock);
1379                     }
1380         }
1381
1382         read_lock(&mrt_lock);
1383         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1384
1385         /*
1386          *      No usable cache entry
1387          */
1388         if (cache==NULL) {
1389                 int vif;
1390
                /* For a local packet the original goes to local delivery and
                 * the clone (taken before delivery) is what gets queued. */
1391                 if (local) {
1392                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1393                         ip_local_deliver(skb);
1394                         if (skb2 == NULL) {
1395                                 read_unlock(&mrt_lock);
1396                                 return -ENOBUFS;
1397                         }
1398                         skb = skb2;
1399                 }
1400
                /* Pass the packet to ipmr_cache_unresolved() for the vif it
                 * arrived on; without such a vif it is freed. */
1401                 vif = ipmr_find_vif(skb->dev);
1402                 if (vif >= 0) {
1403                         int err = ipmr_cache_unresolved(vif, skb);
1404                         read_unlock(&mrt_lock);
1405
1406                         return err;
1407                 }
1408                 read_unlock(&mrt_lock);
1409                 kfree_skb(skb);
1410                 return -ENODEV;
1411         }
1412
        /* With local set ip_mr_forward() leaves skb alone, so it is still
         * valid for the local delivery below. */
1413         ip_mr_forward(skb, cache, local);
1414
1415         read_unlock(&mrt_lock);
1416
1417         if (local)
1418                 return ip_local_deliver(skb);
1419
1420         return 0;
1421
1422 dont_forward:
1423         if (local)
1424                 return ip_local_deliver(skb);
1425         kfree_skb(skb);
1426         return 0;
1427 }
1428
1429 #ifdef CONFIG_IP_PIMSM_V1
1430 /*
1431  * Handle IGMP messages of PIMv1
1432  */
1433
1434 int pim_rcv_v1(struct sk_buff * skb)
1435 {
1436         struct igmphdr *pim;
1437         struct iphdr   *encap;
1438         struct net_device  *reg_dev = NULL;
1439
1440         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1441                 goto drop;
1442
1443         pim = igmp_hdr(skb);
1444
1445         if (!mroute_do_pim ||
1446             skb->len < sizeof(*pim) + sizeof(*encap) ||
1447             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1448                 goto drop;
1449
1450         encap = (struct iphdr *)(skb_transport_header(skb) +
1451                                  sizeof(struct igmphdr));
1452         /*
1453            Check that:
1454            a. packet is really destinted to a multicast group
1455            b. packet is not a NULL-REGISTER
1456            c. packet is not truncated
1457          */
1458         if (!MULTICAST(encap->daddr) ||
1459             encap->tot_len == 0 ||
1460             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1461                 goto drop;
1462
1463         read_lock(&mrt_lock);
1464         if (reg_vif_num >= 0)
1465                 reg_dev = vif_table[reg_vif_num].dev;
1466         if (reg_dev)
1467                 dev_hold(reg_dev);
1468         read_unlock(&mrt_lock);
1469
1470         if (reg_dev == NULL)
1471                 goto drop;
1472
1473         skb->mac_header = skb->network_header;
1474         skb_pull(skb, (u8*)encap - skb->data);
1475         skb_reset_network_header(skb);
1476         skb->dev = reg_dev;
1477         skb->protocol = htons(ETH_P_IP);
1478         skb->ip_summed = 0;
1479         skb->pkt_type = PACKET_HOST;
1480         dst_release(skb->dst);
1481         skb->dst = NULL;
1482         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1483         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1484         nf_reset(skb);
1485         netif_rx(skb);
1486         dev_put(reg_dev);
1487         return 0;
1488  drop:
1489         kfree_skb(skb);
1490         return 0;
1491 }
1492 #endif
1493
1494 #ifdef CONFIG_IP_PIMSM_V2
1495 static int pim_rcv(struct sk_buff * skb)
1496 {
1497         struct pimreghdr *pim;
1498         struct iphdr   *encap;
1499         struct net_device  *reg_dev = NULL;
1500
1501         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1502                 goto drop;
1503
1504         pim = (struct pimreghdr *)skb_transport_header(skb);
1505         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1506             (pim->flags&PIM_NULL_REGISTER) ||
1507             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1508              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1509                 goto drop;
1510
1511         /* check if the inner packet is destined to mcast group */
1512         encap = (struct iphdr *)(skb_transport_header(skb) +
1513                                  sizeof(struct pimreghdr));
1514         if (!MULTICAST(encap->daddr) ||
1515             encap->tot_len == 0 ||
1516             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1517                 goto drop;
1518
1519         read_lock(&mrt_lock);
1520         if (reg_vif_num >= 0)
1521                 reg_dev = vif_table[reg_vif_num].dev;
1522         if (reg_dev)
1523                 dev_hold(reg_dev);
1524         read_unlock(&mrt_lock);
1525
1526         if (reg_dev == NULL)
1527                 goto drop;
1528
1529         skb->mac_header = skb->network_header;
1530         skb_pull(skb, (u8*)encap - skb->data);
1531         skb_reset_network_header(skb);
1532         skb->dev = reg_dev;
1533         skb->protocol = htons(ETH_P_IP);
1534         skb->ip_summed = 0;
1535         skb->pkt_type = PACKET_HOST;
1536         dst_release(skb->dst);
1537         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1538         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1539         skb->dst = NULL;
1540         nf_reset(skb);
1541         netif_rx(skb);
1542         dev_put(reg_dev);
1543         return 0;
1544  drop:
1545         kfree_skb(skb);
1546         return 0;
1547 }
1548 #endif
1549
1550 static int
1551 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1552 {
1553         int ct;
1554         struct rtnexthop *nhp;
1555         struct net_device *dev = vif_table[c->mfc_parent].dev;
1556         u8 *b = skb_tail_pointer(skb);
1557         struct rtattr *mp_head;
1558
1559         if (dev)
1560                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1561
1562         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1563
1564         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1565                 if (c->mfc_un.res.ttls[ct] < 255) {
1566                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1567                                 goto rtattr_failure;
1568                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1569                         nhp->rtnh_flags = 0;
1570                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1571                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1572                         nhp->rtnh_len = sizeof(*nhp);
1573                 }
1574         }
1575         mp_head->rta_type = RTA_MULTIPATH;
1576         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1577         rtm->rtm_type = RTN_MULTICAST;
1578         return 1;
1579
1580 rtattr_failure:
1581         nlmsg_trim(skb, b);
1582         return -EMSGSIZE;
1583 }
1584
1585 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1586 {
1587         int err;
1588         struct mfc_cache *cache;
1589         struct rtable *rt = (struct rtable*)skb->dst;
1590
1591         read_lock(&mrt_lock);
1592         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1593
1594         if (cache==NULL) {
1595                 struct sk_buff *skb2;
1596                 struct iphdr *iph;
1597                 struct net_device *dev;
1598                 int vif;
1599
1600                 if (nowait) {
1601                         read_unlock(&mrt_lock);
1602                         return -EAGAIN;
1603                 }
1604
1605                 dev = skb->dev;
1606                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1607                         read_unlock(&mrt_lock);
1608                         return -ENODEV;
1609                 }
1610                 skb2 = skb_clone(skb, GFP_ATOMIC);
1611                 if (!skb2) {
1612                         read_unlock(&mrt_lock);
1613                         return -ENOMEM;
1614                 }
1615
1616                 skb_push(skb2, sizeof(struct iphdr));
1617                 skb_reset_network_header(skb2);
1618                 iph = ip_hdr(skb2);
1619                 iph->ihl = sizeof(struct iphdr) >> 2;
1620                 iph->saddr = rt->rt_src;
1621                 iph->daddr = rt->rt_dst;
1622                 iph->version = 0;
1623                 err = ipmr_cache_unresolved(vif, skb2);
1624                 read_unlock(&mrt_lock);
1625                 return err;
1626         }
1627
1628         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1629                 cache->mfc_flags |= MFC_NOTIFY;
1630         err = ipmr_fill_mroute(skb, cache, rtm);
1631         read_unlock(&mrt_lock);
1632         return err;
1633 }
1634
1635 #ifdef CONFIG_PROC_FS
1636 /*
1637  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1638  */
/* Per-open iteration state of the vif seq_file: ct is the vif_table index
 * the iterator is currently positioned at. */
1639 struct ipmr_vif_iter {
1640         int ct;
1641 };
1642
1643 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1644                                            loff_t pos)
1645 {
1646         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1647                 if (!VIF_EXISTS(iter->ct))
1648                         continue;
1649                 if (pos-- == 0)
1650                         return &vif_table[iter->ct];
1651         }
1652         return NULL;
1653 }
1654
1655 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1656 {
1657         read_lock(&mrt_lock);
1658         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1659                 : SEQ_START_TOKEN;
1660 }
1661
1662 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1663 {
1664         struct ipmr_vif_iter *iter = seq->private;
1665
1666         ++*pos;
1667         if (v == SEQ_START_TOKEN)
1668                 return ipmr_vif_seq_idx(iter, 0);
1669
1670         while (++iter->ct < maxvif) {
1671                 if (!VIF_EXISTS(iter->ct))
1672                         continue;
1673                 return &vif_table[iter->ct];
1674         }
1675         return NULL;
1676 }
1677
/* seq_file stop callback: drops the mrt_lock read lock that
 * ipmr_vif_seq_start() took. */
1678 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1679 {
1680         read_unlock(&mrt_lock);
1681 }
1682
1683 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1684 {
1685         if (v == SEQ_START_TOKEN) {
1686                 seq_puts(seq,
1687                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1688         } else {
1689                 const struct vif_device *vif = v;
1690                 const char *name =  vif->dev ? vif->dev->name : "none";
1691
1692                 seq_printf(seq,
1693                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1694                            vif - vif_table,
1695                            name, vif->bytes_in, vif->pkt_in,
1696                            vif->bytes_out, vif->pkt_out,
1697                            vif->flags, vif->local, vif->remote);
1698         }
1699         return 0;
1700 }
1701
/* seq_file operations for the vif listing; passed to seq_open() in
 * ipmr_vif_open(). */
1702 static const struct seq_operations ipmr_vif_seq_ops = {
1703         .start = ipmr_vif_seq_start,
1704         .next  = ipmr_vif_seq_next,
1705         .stop  = ipmr_vif_seq_stop,
1706         .show  = ipmr_vif_seq_show,
1707 };
1708
1709 static int ipmr_vif_open(struct inode *inode, struct file *file)
1710 {
1711         struct seq_file *seq;
1712         int rc = -ENOMEM;
1713         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1714
1715         if (!s)
1716                 goto out;
1717
1718         rc = seq_open(file, &ipmr_vif_seq_ops);
1719         if (rc)
1720                 goto out_kfree;
1721
1722         s->ct = 0;
1723         seq = file->private_data;
1724         seq->private = s;
1725 out:
1726         return rc;
1727 out_kfree:
1728         kfree(s);
1729         goto out;
1730
1731 }
1732
/* File operations of the vif listing.  Release is seq_release_private,
 * matching the private iterator that ipmr_vif_open() attaches. */
1733 static const struct file_operations ipmr_vif_fops = {
1734         .owner   = THIS_MODULE,
1735         .open    = ipmr_vif_open,
1736         .read    = seq_read,
1737         .llseek  = seq_lseek,
1738         .release = seq_release_private,
1739 };
1740
/*
 * Per-open iteration state of the mfc cache seq_file.
 * cache names the list being walked: mfc_cache_array, &mfc_unres_queue,
 * or NULL when no list is being walked; ipmr_mfc_seq_stop() uses it to
 * pick the lock to release.  ct is the current mfc_cache_array line.
 */
1741 struct ipmr_mfc_iter {
1742         struct mfc_cache **cache;
1743         int ct;
1744 };
1745
1746
1747 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1748 {
1749         struct mfc_cache *mfc;
1750
1751         it->cache = mfc_cache_array;
1752         read_lock(&mrt_lock);
1753         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1754                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1755                         if (pos-- == 0)
1756                                 return mfc;
1757         read_unlock(&mrt_lock);
1758
1759         it->cache = &mfc_unres_queue;
1760         spin_lock_bh(&mfc_unres_lock);
1761         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1762                 if (pos-- == 0)
1763                         return mfc;
1764         spin_unlock_bh(&mfc_unres_lock);
1765
1766         it->cache = NULL;
1767         return NULL;
1768 }
1769
1770
1771 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1772 {
1773         struct ipmr_mfc_iter *it = seq->private;
1774         it->cache = NULL;
1775         it->ct = 0;
1776         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1777                 : SEQ_START_TOKEN;
1778 }
1779
/*
 *      seq_file next callback: advance *pos and return the entry after v,
 *      or NULL at the end.
 *
 *      The walk covers mfc_cache_array line by line and then the
 *      unresolved queue.  When it moves from the array to the queue it
 *      drops mrt_lock and takes mfc_unres_lock; it->cache always names the
 *      list whose lock is held, and is set to NULL once none is held.
 */
1780 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1781 {
1782         struct mfc_cache *mfc = v;
1783         struct ipmr_mfc_iter *it = seq->private;
1784
1785         ++*pos;
1786
1787         if (v == SEQ_START_TOKEN)
1788                 return ipmr_mfc_seq_idx(seq->private, 0);
1789
        /* More entries on the current chain? */
1790         if (mfc->next)
1791                 return mfc->next;
1792
        /* The unresolved queue is the last list: its end is the end. */
1793         if (it->cache == &mfc_unres_queue)
1794                 goto end_of_list;
1795
1796         BUG_ON(it->cache != mfc_cache_array);
1797
        /* Move on to the next non-empty line of mfc_cache_array */
1798         while (++it->ct < MFC_LINES) {
1799                 mfc = mfc_cache_array[it->ct];
1800                 if (mfc)
1801                         return mfc;
1802         }
1803
1804         /* exhausted cache_array, show unresolved */
1805         read_unlock(&mrt_lock);
1806         it->cache = &mfc_unres_queue;
1807         it->ct = 0;
1808
1809         spin_lock_bh(&mfc_unres_lock);
1810         mfc = mfc_unres_queue;
1811         if (mfc)
1812                 return mfc;
1813
        /* Release mfc_unres_lock here and clear it->cache, so that
         * ipmr_mfc_seq_stop() does not unlock a second time. */
1814  end_of_list:
1815         spin_unlock_bh(&mfc_unres_lock);
1816         it->cache = NULL;
1817
1818         return NULL;
1819 }
1820
1821 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1822 {
1823         struct ipmr_mfc_iter *it = seq->private;
1824
1825         if (it->cache == &mfc_unres_queue)
1826                 spin_unlock_bh(&mfc_unres_lock);
1827         else if (it->cache == mfc_cache_array)
1828                 read_unlock(&mrt_lock);
1829 }
1830
1831 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1832 {
1833         int n;
1834
1835         if (v == SEQ_START_TOKEN) {
1836                 seq_puts(seq,
1837                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1838         } else {
1839                 const struct mfc_cache *mfc = v;
1840                 const struct ipmr_mfc_iter *it = seq->private;
1841
1842                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1843                            (unsigned long) mfc->mfc_mcastgrp,
1844                            (unsigned long) mfc->mfc_origin,
1845                            mfc->mfc_parent,
1846                            mfc->mfc_un.res.pkt,
1847                            mfc->mfc_un.res.bytes,
1848                            mfc->mfc_un.res.wrong_if);
1849
1850                 if (it->cache != &mfc_unres_queue) {
1851                         for (n = mfc->mfc_un.res.minvif;
1852                              n < mfc->mfc_un.res.maxvif; n++ ) {
1853                                 if (VIF_EXISTS(n)
1854                                    && mfc->mfc_un.res.ttls[n] < 255)
1855                                 seq_printf(seq,
1856                                            " %2d:%-3d",
1857                                            n, mfc->mfc_un.res.ttls[n]);
1858                         }
1859                 }
1860                 seq_putc(seq, '\n');
1861         }
1862         return 0;
1863 }
1864
1865 static const struct seq_operations ipmr_mfc_seq_ops = {
1866         .start = ipmr_mfc_seq_start,
1867         .next  = ipmr_mfc_seq_next,
1868         .stop  = ipmr_mfc_seq_stop,
1869         .show  = ipmr_mfc_seq_show,
1870 };
1871
1872 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1873 {
1874         struct seq_file *seq;
1875         int rc = -ENOMEM;
1876         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1877
1878         if (!s)
1879                 goto out;
1880
1881         rc = seq_open(file, &ipmr_mfc_seq_ops);
1882         if (rc)
1883                 goto out_kfree;
1884
1885         seq = file->private_data;
1886         seq->private = s;
1887 out:
1888         return rc;
1889 out_kfree:
1890         kfree(s);
1891         goto out;
1892
1893 }
1894
1895 static const struct file_operations ipmr_mfc_fops = {
1896         .owner   = THIS_MODULE,
1897         .open    = ipmr_mfc_open,
1898         .read    = seq_read,
1899         .llseek  = seq_lseek,
1900         .release = seq_release_private,
1901 };
1902 #endif
1903
1904 #ifdef CONFIG_IP_PIMSM_V2
1905 static struct net_protocol pim_protocol = {
1906         .handler        =       pim_rcv,
1907 };
1908 #endif
1909
1910
1911 /*
1912  *      Setup for IP multicast routing
1913  */
1914
1915 void __init ip_mr_init(void)
1916 {
1917         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1918                                        sizeof(struct mfc_cache),
1919                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1920                                        NULL);
1921         init_timer(&ipmr_expire_timer);
1922         ipmr_expire_timer.function=ipmr_expire_process;
1923         register_netdevice_notifier(&ip_mr_notifier);
1924 #ifdef CONFIG_PROC_FS
1925         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1926         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1927 #endif
1928 }