[NET]: Eliminate duplicate copies of dst_discard
[linux-2.6] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/net_namespace.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66 #include <net/netlink.h>
67
68 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
69 #define CONFIG_IP_PIMSM 1
70 #endif
71
72 static struct sock *mroute_socket;
73
74
75 /* Big lock, protecting vif table, mrt cache and mroute socket state.
76    Note that the changes are semaphored via rtnl_lock.
77  */
78
79 static DEFINE_RWLOCK(mrt_lock);
80
81 /*
82  *      Multicast router control variables
83  */
84
85 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
86 static int maxvif;
87
88 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
89
90 static int mroute_do_assert;                            /* Set in PIM assert    */
91 static int mroute_do_pim;
92
93 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
94
95 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
96 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
97
98 /* Special spinlock for queue of unresolved entries */
99 static DEFINE_SPINLOCK(mfc_unres_lock);
100
101 /* We return to original Alan's scheme. Hash table of resolved
102    entries is changed only in process context and protected
103    with weak lock mrt_lock. Queue of unresolved entries is protected
104    with strong spinlock mfc_unres_lock.
105
106    In this case data path is free of exclusive locks at all.
107  */
108
109 static struct kmem_cache *mrt_cachep __read_mostly;
110
111 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
112 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
113 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
114
115 #ifdef CONFIG_IP_PIMSM_V2
116 static struct net_protocol pim_protocol;
117 #endif
118
119 static struct timer_list ipmr_expire_timer;
120
121 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
122
123 static
124 struct net_device *ipmr_new_tunnel(struct vifctl *v)
125 {
126         struct net_device  *dev;
127
128         dev = __dev_get_by_name(&init_net, "tunl0");
129
130         if (dev) {
131                 int err;
132                 struct ifreq ifr;
133                 mm_segment_t    oldfs;
134                 struct ip_tunnel_parm p;
135                 struct in_device  *in_dev;
136
137                 memset(&p, 0, sizeof(p));
138                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
139                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
140                 p.iph.version = 4;
141                 p.iph.ihl = 5;
142                 p.iph.protocol = IPPROTO_IPIP;
143                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
144                 ifr.ifr_ifru.ifru_data = (void*)&p;
145
146                 oldfs = get_fs(); set_fs(KERNEL_DS);
147                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
148                 set_fs(oldfs);
149
150                 dev = NULL;
151
152                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
153                         dev->flags |= IFF_MULTICAST;
154
155                         in_dev = __in_dev_get_rtnl(dev);
156                         if (in_dev == NULL)
157                                 goto failure;
158
159                         ipv4_devconf_setall(in_dev);
160                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
161
162                         if (dev_open(dev))
163                                 goto failure;
164                 }
165         }
166         return dev;
167
168 failure:
169         /* allow the register to be completed before unregistering. */
170         rtnl_unlock();
171         rtnl_lock();
172
173         unregister_netdevice(dev);
174         return NULL;
175 }
176
177 #ifdef CONFIG_IP_PIMSM
178
179 static int reg_vif_num = -1;
180
181 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
182 {
183         read_lock(&mrt_lock);
184         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
185         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
186         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
187         read_unlock(&mrt_lock);
188         kfree_skb(skb);
189         return 0;
190 }
191
/* The statistics block is the private area of the register device. */
static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        struct net_device_stats *stats;

        stats = (struct net_device_stats *)netdev_priv(dev);
        return stats;
}
196
197 static void reg_vif_setup(struct net_device *dev)
198 {
199         dev->type               = ARPHRD_PIMREG;
200         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
201         dev->flags              = IFF_NOARP;
202         dev->hard_start_xmit    = reg_vif_xmit;
203         dev->get_stats          = reg_vif_get_stats;
204         dev->destructor         = free_netdev;
205 }
206
207 static struct net_device *ipmr_reg_vif(void)
208 {
209         struct net_device *dev;
210         struct in_device *in_dev;
211
212         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
213                            reg_vif_setup);
214
215         if (dev == NULL)
216                 return NULL;
217
218         if (register_netdevice(dev)) {
219                 free_netdev(dev);
220                 return NULL;
221         }
222         dev->iflink = 0;
223
224         rcu_read_lock();
225         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
226                 rcu_read_unlock();
227                 goto failure;
228         }
229
230         ipv4_devconf_setall(in_dev);
231         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
232         rcu_read_unlock();
233
234         if (dev_open(dev))
235                 goto failure;
236
237         return dev;
238
239 failure:
240         /* allow the register to be completed before unregistering. */
241         rtnl_unlock();
242         rtnl_lock();
243
244         unregister_netdevice(dev);
245         return NULL;
246 }
247 #endif
248
249 /*
250  *      Delete a VIF entry
251  */
252
253 static int vif_delete(int vifi)
254 {
255         struct vif_device *v;
256         struct net_device *dev;
257         struct in_device *in_dev;
258
259         if (vifi < 0 || vifi >= maxvif)
260                 return -EADDRNOTAVAIL;
261
262         v = &vif_table[vifi];
263
264         write_lock_bh(&mrt_lock);
265         dev = v->dev;
266         v->dev = NULL;
267
268         if (!dev) {
269                 write_unlock_bh(&mrt_lock);
270                 return -EADDRNOTAVAIL;
271         }
272
273 #ifdef CONFIG_IP_PIMSM
274         if (vifi == reg_vif_num)
275                 reg_vif_num = -1;
276 #endif
277
278         if (vifi+1 == maxvif) {
279                 int tmp;
280                 for (tmp=vifi-1; tmp>=0; tmp--) {
281                         if (VIF_EXISTS(tmp))
282                                 break;
283                 }
284                 maxvif = tmp+1;
285         }
286
287         write_unlock_bh(&mrt_lock);
288
289         dev_set_allmulti(dev, -1);
290
291         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
292                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
293                 ip_rt_multicast_event(in_dev);
294         }
295
296         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
297                 unregister_netdevice(dev);
298
299         dev_put(dev);
300         return 0;
301 }
302
303 /* Destroy an unresolved cache entry, killing queued skbs
304    and reporting error to netlink readers.
305  */
306
307 static void ipmr_destroy_unres(struct mfc_cache *c)
308 {
309         struct sk_buff *skb;
310         struct nlmsgerr *e;
311
312         atomic_dec(&cache_resolve_queue_len);
313
314         while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
315                 if (ip_hdr(skb)->version == 0) {
316                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
317                         nlh->nlmsg_type = NLMSG_ERROR;
318                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
319                         skb_trim(skb, nlh->nlmsg_len);
320                         e = NLMSG_DATA(nlh);
321                         e->error = -ETIMEDOUT;
322                         memset(&e->msg, 0, sizeof(e->msg));
323
324                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
325                 } else
326                         kfree_skb(skb);
327         }
328
329         kmem_cache_free(mrt_cachep, c);
330 }
331
332
333 /* Single timer process for all the unresolved queue. */
334
335 static void ipmr_expire_process(unsigned long dummy)
336 {
337         unsigned long now;
338         unsigned long expires;
339         struct mfc_cache *c, **cp;
340
341         if (!spin_trylock(&mfc_unres_lock)) {
342                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
343                 return;
344         }
345
346         if (atomic_read(&cache_resolve_queue_len) == 0)
347                 goto out;
348
349         now = jiffies;
350         expires = 10*HZ;
351         cp = &mfc_unres_queue;
352
353         while ((c=*cp) != NULL) {
354                 if (time_after(c->mfc_un.unres.expires, now)) {
355                         unsigned long interval = c->mfc_un.unres.expires - now;
356                         if (interval < expires)
357                                 expires = interval;
358                         cp = &c->next;
359                         continue;
360                 }
361
362                 *cp = c->next;
363
364                 ipmr_destroy_unres(c);
365         }
366
367         if (atomic_read(&cache_resolve_queue_len))
368                 mod_timer(&ipmr_expire_timer, jiffies + expires);
369
370 out:
371         spin_unlock(&mfc_unres_lock);
372 }
373
374 /* Fill oifs list. It is called under write locked mrt_lock. */
375
376 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
377 {
378         int vifi;
379
380         cache->mfc_un.res.minvif = MAXVIFS;
381         cache->mfc_un.res.maxvif = 0;
382         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
383
384         for (vifi=0; vifi<maxvif; vifi++) {
385                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
386                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
387                         if (cache->mfc_un.res.minvif > vifi)
388                                 cache->mfc_un.res.minvif = vifi;
389                         if (cache->mfc_un.res.maxvif <= vifi)
390                                 cache->mfc_un.res.maxvif = vifi + 1;
391                 }
392         }
393 }
394
395 static int vif_add(struct vifctl *vifc, int mrtsock)
396 {
397         int vifi = vifc->vifc_vifi;
398         struct vif_device *v = &vif_table[vifi];
399         struct net_device *dev;
400         struct in_device *in_dev;
401
402         /* Is vif busy ? */
403         if (VIF_EXISTS(vifi))
404                 return -EADDRINUSE;
405
406         switch (vifc->vifc_flags) {
407 #ifdef CONFIG_IP_PIMSM
408         case VIFF_REGISTER:
409                 /*
410                  * Special Purpose VIF in PIM
411                  * All the packets will be sent to the daemon
412                  */
413                 if (reg_vif_num >= 0)
414                         return -EADDRINUSE;
415                 dev = ipmr_reg_vif();
416                 if (!dev)
417                         return -ENOBUFS;
418                 break;
419 #endif
420         case VIFF_TUNNEL:
421                 dev = ipmr_new_tunnel(vifc);
422                 if (!dev)
423                         return -ENOBUFS;
424                 break;
425         case 0:
426                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
427                 if (!dev)
428                         return -EADDRNOTAVAIL;
429                 dev_put(dev);
430                 break;
431         default:
432                 return -EINVAL;
433         }
434
435         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
436                 return -EADDRNOTAVAIL;
437         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
438         dev_set_allmulti(dev, +1);
439         ip_rt_multicast_event(in_dev);
440
441         /*
442          *      Fill in the VIF structures
443          */
444         v->rate_limit=vifc->vifc_rate_limit;
445         v->local=vifc->vifc_lcl_addr.s_addr;
446         v->remote=vifc->vifc_rmt_addr.s_addr;
447         v->flags=vifc->vifc_flags;
448         if (!mrtsock)
449                 v->flags |= VIFF_STATIC;
450         v->threshold=vifc->vifc_threshold;
451         v->bytes_in = 0;
452         v->bytes_out = 0;
453         v->pkt_in = 0;
454         v->pkt_out = 0;
455         v->link = dev->ifindex;
456         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
457                 v->link = dev->iflink;
458
459         /* And finish update writing critical data */
460         write_lock_bh(&mrt_lock);
461         dev_hold(dev);
462         v->dev=dev;
463 #ifdef CONFIG_IP_PIMSM
464         if (v->flags&VIFF_REGISTER)
465                 reg_vif_num = vifi;
466 #endif
467         if (vifi+1 > maxvif)
468                 maxvif = vifi+1;
469         write_unlock_bh(&mrt_lock);
470         return 0;
471 }
472
473 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
474 {
475         int line=MFC_HASH(mcastgrp,origin);
476         struct mfc_cache *c;
477
478         for (c=mfc_cache_array[line]; c; c = c->next) {
479                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
480                         break;
481         }
482         return c;
483 }
484
485 /*
486  *      Allocate a multicast cache entry
487  */
488 static struct mfc_cache *ipmr_cache_alloc(void)
489 {
490         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
491         if (c==NULL)
492                 return NULL;
493         c->mfc_un.res.minvif = MAXVIFS;
494         return c;
495 }
496
497 static struct mfc_cache *ipmr_cache_alloc_unres(void)
498 {
499         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
500         if (c==NULL)
501                 return NULL;
502         skb_queue_head_init(&c->mfc_un.unres.unresolved);
503         c->mfc_un.unres.expires = jiffies + 10*HZ;
504         return c;
505 }
506
507 /*
508  *      A cache entry has gone into a resolved state from queued
509  */
510
511 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
512 {
513         struct sk_buff *skb;
514         struct nlmsgerr *e;
515
516         /*
517          *      Play the pending entries through our router
518          */
519
520         while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
521                 if (ip_hdr(skb)->version == 0) {
522                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
523
524                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
525                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
526                                                   (u8 *)nlh);
527                         } else {
528                                 nlh->nlmsg_type = NLMSG_ERROR;
529                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
530                                 skb_trim(skb, nlh->nlmsg_len);
531                                 e = NLMSG_DATA(nlh);
532                                 e->error = -EMSGSIZE;
533                                 memset(&e->msg, 0, sizeof(e->msg));
534                         }
535
536                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
537                 } else
538                         ip_mr_forward(skb, c, 0);
539         }
540 }
541
542 /*
543  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
544  *      expects the following bizarre scheme.
545  *
546  *      Called under mrt_lock.
547  */
548
549 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
550 {
551         struct sk_buff *skb;
552         const int ihl = ip_hdrlen(pkt);
553         struct igmphdr *igmp;
554         struct igmpmsg *msg;
555         int ret;
556
557 #ifdef CONFIG_IP_PIMSM
558         if (assert == IGMPMSG_WHOLEPKT)
559                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
560         else
561 #endif
562                 skb = alloc_skb(128, GFP_ATOMIC);
563
564         if (!skb)
565                 return -ENOBUFS;
566
567 #ifdef CONFIG_IP_PIMSM
568         if (assert == IGMPMSG_WHOLEPKT) {
569                 /* Ugly, but we have no choice with this interface.
570                    Duplicate old header, fix ihl, length etc.
571                    And all this only to mangle msg->im_msgtype and
572                    to set msg->im_mbz to "mbz" :-)
573                  */
574                 skb_push(skb, sizeof(struct iphdr));
575                 skb_reset_network_header(skb);
576                 skb_reset_transport_header(skb);
577                 msg = (struct igmpmsg *)skb_network_header(skb);
578                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
579                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
580                 msg->im_mbz = 0;
581                 msg->im_vif = reg_vif_num;
582                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
583                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
584                                              sizeof(struct iphdr));
585         } else
586 #endif
587         {
588
589         /*
590          *      Copy the IP header
591          */
592
593         skb->network_header = skb->tail;
594         skb_put(skb, ihl);
595         skb_copy_to_linear_data(skb, pkt->data, ihl);
596         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
597         msg = (struct igmpmsg *)skb_network_header(skb);
598         msg->im_vif = vifi;
599         skb->dst = dst_clone(pkt->dst);
600
601         /*
602          *      Add our header
603          */
604
605         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
606         igmp->type      =
607         msg->im_msgtype = assert;
608         igmp->code      =       0;
609         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
610         skb->transport_header = skb->network_header;
611         }
612
613         if (mroute_socket == NULL) {
614                 kfree_skb(skb);
615                 return -EINVAL;
616         }
617
618         /*
619          *      Deliver to mrouted
620          */
621         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
622                 if (net_ratelimit())
623                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
624                 kfree_skb(skb);
625         }
626
627         return ret;
628 }
629
630 /*
631  *      Queue a packet for resolution. It gets locked cache entry!
632  */
633
634 static int
635 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
636 {
637         int err;
638         struct mfc_cache *c;
639         const struct iphdr *iph = ip_hdr(skb);
640
641         spin_lock_bh(&mfc_unres_lock);
642         for (c=mfc_unres_queue; c; c=c->next) {
643                 if (c->mfc_mcastgrp == iph->daddr &&
644                     c->mfc_origin == iph->saddr)
645                         break;
646         }
647
648         if (c == NULL) {
649                 /*
650                  *      Create a new entry if allowable
651                  */
652
653                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
654                     (c=ipmr_cache_alloc_unres())==NULL) {
655                         spin_unlock_bh(&mfc_unres_lock);
656
657                         kfree_skb(skb);
658                         return -ENOBUFS;
659                 }
660
661                 /*
662                  *      Fill in the new cache entry
663                  */
664                 c->mfc_parent   = -1;
665                 c->mfc_origin   = iph->saddr;
666                 c->mfc_mcastgrp = iph->daddr;
667
668                 /*
669                  *      Reflect first query at mrouted.
670                  */
671                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
672                         /* If the report failed throw the cache entry
673                            out - Brad Parker
674                          */
675                         spin_unlock_bh(&mfc_unres_lock);
676
677                         kmem_cache_free(mrt_cachep, c);
678                         kfree_skb(skb);
679                         return err;
680                 }
681
682                 atomic_inc(&cache_resolve_queue_len);
683                 c->next = mfc_unres_queue;
684                 mfc_unres_queue = c;
685
686                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
687         }
688
689         /*
690          *      See if we can append the packet
691          */
692         if (c->mfc_un.unres.unresolved.qlen>3) {
693                 kfree_skb(skb);
694                 err = -ENOBUFS;
695         } else {
696                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
697                 err = 0;
698         }
699
700         spin_unlock_bh(&mfc_unres_lock);
701         return err;
702 }
703
704 /*
705  *      MFC cache manipulation by user space mroute daemon
706  */
707
708 static int ipmr_mfc_delete(struct mfcctl *mfc)
709 {
710         int line;
711         struct mfc_cache *c, **cp;
712
713         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714
715         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
718                         write_lock_bh(&mrt_lock);
719                         *cp = c->next;
720                         write_unlock_bh(&mrt_lock);
721
722                         kmem_cache_free(mrt_cachep, c);
723                         return 0;
724                 }
725         }
726         return -ENOENT;
727 }
728
729 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
730 {
731         int line;
732         struct mfc_cache *uc, *c, **cp;
733
734         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
735
736         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
737                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
738                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
739                         break;
740         }
741
742         if (c != NULL) {
743                 write_lock_bh(&mrt_lock);
744                 c->mfc_parent = mfc->mfcc_parent;
745                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
746                 if (!mrtsock)
747                         c->mfc_flags |= MFC_STATIC;
748                 write_unlock_bh(&mrt_lock);
749                 return 0;
750         }
751
752         if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
753                 return -EINVAL;
754
755         c=ipmr_cache_alloc();
756         if (c==NULL)
757                 return -ENOMEM;
758
759         c->mfc_origin=mfc->mfcc_origin.s_addr;
760         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
761         c->mfc_parent=mfc->mfcc_parent;
762         ipmr_update_thresholds(c, mfc->mfcc_ttls);
763         if (!mrtsock)
764                 c->mfc_flags |= MFC_STATIC;
765
766         write_lock_bh(&mrt_lock);
767         c->next = mfc_cache_array[line];
768         mfc_cache_array[line] = c;
769         write_unlock_bh(&mrt_lock);
770
771         /*
772          *      Check to see if we resolved a queued list. If so we
773          *      need to send on the frames and tidy up.
774          */
775         spin_lock_bh(&mfc_unres_lock);
776         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
777              cp = &uc->next) {
778                 if (uc->mfc_origin == c->mfc_origin &&
779                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
780                         *cp = uc->next;
781                         if (atomic_dec_and_test(&cache_resolve_queue_len))
782                                 del_timer(&ipmr_expire_timer);
783                         break;
784                 }
785         }
786         spin_unlock_bh(&mfc_unres_lock);
787
788         if (uc) {
789                 ipmr_cache_resolve(uc, c);
790                 kmem_cache_free(mrt_cachep, uc);
791         }
792         return 0;
793 }
794
795 /*
796  *      Close the multicast socket, and clear the vif tables etc
797  */
798
799 static void mroute_clean_tables(struct sock *sk)
800 {
801         int i;
802
803         /*
804          *      Shut down all active vif entries
805          */
806         for (i=0; i<maxvif; i++) {
807                 if (!(vif_table[i].flags&VIFF_STATIC))
808                         vif_delete(i);
809         }
810
811         /*
812          *      Wipe the cache
813          */
814         for (i=0;i<MFC_LINES;i++) {
815                 struct mfc_cache *c, **cp;
816
817                 cp = &mfc_cache_array[i];
818                 while ((c = *cp) != NULL) {
819                         if (c->mfc_flags&MFC_STATIC) {
820                                 cp = &c->next;
821                                 continue;
822                         }
823                         write_lock_bh(&mrt_lock);
824                         *cp = c->next;
825                         write_unlock_bh(&mrt_lock);
826
827                         kmem_cache_free(mrt_cachep, c);
828                 }
829         }
830
831         if (atomic_read(&cache_resolve_queue_len) != 0) {
832                 struct mfc_cache *c;
833
834                 spin_lock_bh(&mfc_unres_lock);
835                 while (mfc_unres_queue != NULL) {
836                         c = mfc_unres_queue;
837                         mfc_unres_queue = c->next;
838                         spin_unlock_bh(&mfc_unres_lock);
839
840                         ipmr_destroy_unres(c);
841
842                         spin_lock_bh(&mfc_unres_lock);
843                 }
844                 spin_unlock_bh(&mfc_unres_lock);
845         }
846 }
847
848 static void mrtsock_destruct(struct sock *sk)
849 {
850         rtnl_lock();
851         if (sk == mroute_socket) {
852                 IPV4_DEVCONF_ALL(MC_FORWARDING)--;
853
854                 write_lock_bh(&mrt_lock);
855                 mroute_socket=NULL;
856                 write_unlock_bh(&mrt_lock);
857
858                 mroute_clean_tables(sk);
859         }
860         rtnl_unlock();
861 }
862
863 /*
864  *      Socket options and virtual interface manipulation. The whole
865  *      virtual interface system is a complete heap, but unfortunately
866  *      that's how BSD mrouted happens to think. Maybe one day with a proper
867  *      MOSPF/PIM router set up we can clean this up.
868  */
869
870 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
871 {
872         int ret;
873         struct vifctl vif;
874         struct mfcctl mfc;
875
876         if (optname != MRT_INIT) {
877                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
878                         return -EACCES;
879         }
880
881         switch (optname) {
882         case MRT_INIT:
883                 if (sk->sk_type != SOCK_RAW ||
884                     inet_sk(sk)->num != IPPROTO_IGMP)
885                         return -EOPNOTSUPP;
886                 if (optlen!=sizeof(int))
887                         return -ENOPROTOOPT;
888
889                 rtnl_lock();
890                 if (mroute_socket) {
891                         rtnl_unlock();
892                         return -EADDRINUSE;
893                 }
894
895                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
896                 if (ret == 0) {
897                         write_lock_bh(&mrt_lock);
898                         mroute_socket=sk;
899                         write_unlock_bh(&mrt_lock);
900
901                         IPV4_DEVCONF_ALL(MC_FORWARDING)++;
902                 }
903                 rtnl_unlock();
904                 return ret;
905         case MRT_DONE:
906                 if (sk!=mroute_socket)
907                         return -EACCES;
908                 return ip_ra_control(sk, 0, NULL);
909         case MRT_ADD_VIF:
910         case MRT_DEL_VIF:
911                 if (optlen!=sizeof(vif))
912                         return -EINVAL;
913                 if (copy_from_user(&vif,optval,sizeof(vif)))
914                         return -EFAULT;
915                 if (vif.vifc_vifi >= MAXVIFS)
916                         return -ENFILE;
917                 rtnl_lock();
918                 if (optname==MRT_ADD_VIF) {
919                         ret = vif_add(&vif, sk==mroute_socket);
920                 } else {
921                         ret = vif_delete(vif.vifc_vifi);
922                 }
923                 rtnl_unlock();
924                 return ret;
925
926                 /*
927                  *      Manipulate the forwarding caches. These live
928                  *      in a sort of kernel/user symbiosis.
929                  */
930         case MRT_ADD_MFC:
931         case MRT_DEL_MFC:
932                 if (optlen!=sizeof(mfc))
933                         return -EINVAL;
934                 if (copy_from_user(&mfc,optval, sizeof(mfc)))
935                         return -EFAULT;
936                 rtnl_lock();
937                 if (optname==MRT_DEL_MFC)
938                         ret = ipmr_mfc_delete(&mfc);
939                 else
940                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
941                 rtnl_unlock();
942                 return ret;
943                 /*
944                  *      Control PIM assert.
945                  */
946         case MRT_ASSERT:
947         {
948                 int v;
949                 if (get_user(v,(int __user *)optval))
950                         return -EFAULT;
951                 mroute_do_assert=(v)?1:0;
952                 return 0;
953         }
954 #ifdef CONFIG_IP_PIMSM
955         case MRT_PIM:
956         {
957                 int v, ret;
958                 if (get_user(v,(int __user *)optval))
959                         return -EFAULT;
960                 v = (v)?1:0;
961                 rtnl_lock();
962                 ret = 0;
963                 if (v != mroute_do_pim) {
964                         mroute_do_pim = v;
965                         mroute_do_assert = v;
966 #ifdef CONFIG_IP_PIMSM_V2
967                         if (mroute_do_pim)
968                                 ret = inet_add_protocol(&pim_protocol,
969                                                         IPPROTO_PIM);
970                         else
971                                 ret = inet_del_protocol(&pim_protocol,
972                                                         IPPROTO_PIM);
973                         if (ret < 0)
974                                 ret = -EAGAIN;
975 #endif
976                 }
977                 rtnl_unlock();
978                 return ret;
979         }
980 #endif
981         /*
982          *      Spurious command, or MRT_VERSION which you cannot
983          *      set.
984          */
985         default:
986                 return -ENOPROTOOPT;
987         }
988 }
989
990 /*
991  *      Getsock opt support for the multicast routing system.
992  */
993
994 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
995 {
996         int olr;
997         int val;
998
999         if (optname!=MRT_VERSION &&
1000 #ifdef CONFIG_IP_PIMSM
1001            optname!=MRT_PIM &&
1002 #endif
1003            optname!=MRT_ASSERT)
1004                 return -ENOPROTOOPT;
1005
1006         if (get_user(olr, optlen))
1007                 return -EFAULT;
1008
1009         olr = min_t(unsigned int, olr, sizeof(int));
1010         if (olr < 0)
1011                 return -EINVAL;
1012
1013         if (put_user(olr,optlen))
1014                 return -EFAULT;
1015         if (optname==MRT_VERSION)
1016                 val=0x0305;
1017 #ifdef CONFIG_IP_PIMSM
1018         else if (optname==MRT_PIM)
1019                 val=mroute_do_pim;
1020 #endif
1021         else
1022                 val=mroute_do_assert;
1023         if (copy_to_user(optval,&val,olr))
1024                 return -EFAULT;
1025         return 0;
1026 }
1027
1028 /*
1029  *      The IP multicast ioctl support routines.
1030  */
1031
1032 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1033 {
1034         struct sioc_sg_req sr;
1035         struct sioc_vif_req vr;
1036         struct vif_device *vif;
1037         struct mfc_cache *c;
1038
1039         switch (cmd) {
1040         case SIOCGETVIFCNT:
1041                 if (copy_from_user(&vr,arg,sizeof(vr)))
1042                         return -EFAULT;
1043                 if (vr.vifi>=maxvif)
1044                         return -EINVAL;
1045                 read_lock(&mrt_lock);
1046                 vif=&vif_table[vr.vifi];
1047                 if (VIF_EXISTS(vr.vifi))        {
1048                         vr.icount=vif->pkt_in;
1049                         vr.ocount=vif->pkt_out;
1050                         vr.ibytes=vif->bytes_in;
1051                         vr.obytes=vif->bytes_out;
1052                         read_unlock(&mrt_lock);
1053
1054                         if (copy_to_user(arg,&vr,sizeof(vr)))
1055                                 return -EFAULT;
1056                         return 0;
1057                 }
1058                 read_unlock(&mrt_lock);
1059                 return -EADDRNOTAVAIL;
1060         case SIOCGETSGCNT:
1061                 if (copy_from_user(&sr,arg,sizeof(sr)))
1062                         return -EFAULT;
1063
1064                 read_lock(&mrt_lock);
1065                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1066                 if (c) {
1067                         sr.pktcnt = c->mfc_un.res.pkt;
1068                         sr.bytecnt = c->mfc_un.res.bytes;
1069                         sr.wrong_if = c->mfc_un.res.wrong_if;
1070                         read_unlock(&mrt_lock);
1071
1072                         if (copy_to_user(arg,&sr,sizeof(sr)))
1073                                 return -EFAULT;
1074                         return 0;
1075                 }
1076                 read_unlock(&mrt_lock);
1077                 return -EADDRNOTAVAIL;
1078         default:
1079                 return -ENOIOCTLCMD;
1080         }
1081 }
1082
1083
1084 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1085 {
1086         struct net_device *dev = ptr;
1087         struct vif_device *v;
1088         int ct;
1089
1090         if (dev->nd_net != &init_net)
1091                 return NOTIFY_DONE;
1092
1093         if (event != NETDEV_UNREGISTER)
1094                 return NOTIFY_DONE;
1095         v=&vif_table[0];
1096         for (ct=0;ct<maxvif;ct++,v++) {
1097                 if (v->dev==dev)
1098                         vif_delete(ct);
1099         }
1100         return NOTIFY_DONE;
1101 }
1102
1103
1104 static struct notifier_block ip_mr_notifier={
1105         .notifier_call = ipmr_device_event,
1106 };
1107
1108 /*
1109  *      Encapsulate a packet by attaching a valid IPIP header to it.
1110  *      This avoids tunnel drivers and other mess and gives us the speed so
1111  *      important for multicast video.
1112  */
1113
1114 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1115 {
1116         struct iphdr *iph;
1117         struct iphdr *old_iph = ip_hdr(skb);
1118
1119         skb_push(skb, sizeof(struct iphdr));
1120         skb->transport_header = skb->network_header;
1121         skb_reset_network_header(skb);
1122         iph = ip_hdr(skb);
1123
1124         iph->version    =       4;
1125         iph->tos        =       old_iph->tos;
1126         iph->ttl        =       old_iph->ttl;
1127         iph->frag_off   =       0;
1128         iph->daddr      =       daddr;
1129         iph->saddr      =       saddr;
1130         iph->protocol   =       IPPROTO_IPIP;
1131         iph->ihl        =       5;
1132         iph->tot_len    =       htons(skb->len);
1133         ip_select_ident(iph, skb->dst, NULL);
1134         ip_send_check(iph);
1135
1136         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1137         nf_reset(skb);
1138 }
1139
1140 static inline int ipmr_forward_finish(struct sk_buff *skb)
1141 {
1142         struct ip_options * opt = &(IPCB(skb)->opt);
1143
1144         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1145
1146         if (unlikely(opt->optlen))
1147                 ip_forward_options(skb);
1148
1149         return dst_output(skb);
1150 }
1151
1152 /*
1153  *      Processing handlers for ipmr_forward
1154  */
1155
1156 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1157 {
1158         const struct iphdr *iph = ip_hdr(skb);
1159         struct vif_device *vif = &vif_table[vifi];
1160         struct net_device *dev;
1161         struct rtable *rt;
1162         int    encap = 0;
1163
1164         if (vif->dev == NULL)
1165                 goto out_free;
1166
1167 #ifdef CONFIG_IP_PIMSM
1168         if (vif->flags & VIFF_REGISTER) {
1169                 vif->pkt_out++;
1170                 vif->bytes_out+=skb->len;
1171                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1172                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1173                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1174                 kfree_skb(skb);
1175                 return;
1176         }
1177 #endif
1178
1179         if (vif->flags&VIFF_TUNNEL) {
1180                 struct flowi fl = { .oif = vif->link,
1181                                     .nl_u = { .ip4_u =
1182                                               { .daddr = vif->remote,
1183                                                 .saddr = vif->local,
1184                                                 .tos = RT_TOS(iph->tos) } },
1185                                     .proto = IPPROTO_IPIP };
1186                 if (ip_route_output_key(&rt, &fl))
1187                         goto out_free;
1188                 encap = sizeof(struct iphdr);
1189         } else {
1190                 struct flowi fl = { .oif = vif->link,
1191                                     .nl_u = { .ip4_u =
1192                                               { .daddr = iph->daddr,
1193                                                 .tos = RT_TOS(iph->tos) } },
1194                                     .proto = IPPROTO_IPIP };
1195                 if (ip_route_output_key(&rt, &fl))
1196                         goto out_free;
1197         }
1198
1199         dev = rt->u.dst.dev;
1200
1201         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1202                 /* Do not fragment multicasts. Alas, IPv4 does not
1203                    allow to send ICMP, so that packets will disappear
1204                    to blackhole.
1205                  */
1206
1207                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1208                 ip_rt_put(rt);
1209                 goto out_free;
1210         }
1211
1212         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1213
1214         if (skb_cow(skb, encap)) {
1215                 ip_rt_put(rt);
1216                 goto out_free;
1217         }
1218
1219         vif->pkt_out++;
1220         vif->bytes_out+=skb->len;
1221
1222         dst_release(skb->dst);
1223         skb->dst = &rt->u.dst;
1224         ip_decrease_ttl(ip_hdr(skb));
1225
1226         /* FIXME: forward and output firewalls used to be called here.
1227          * What do we do with netfilter? -- RR */
1228         if (vif->flags & VIFF_TUNNEL) {
1229                 ip_encap(skb, vif->local, vif->remote);
1230                 /* FIXME: extra output firewall step used to be here. --RR */
1231                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1232                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1233         }
1234
1235         IPCB(skb)->flags |= IPSKB_FORWARDED;
1236
1237         /*
1238          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1239          * not only before forwarding, but after forwarding on all output
1240          * interfaces. It is clear, if mrouter runs a multicasting
1241          * program, it should receive packets not depending to what interface
1242          * program is joined.
1243          * If we will not make it, the program will have to join on all
1244          * interfaces. On the other hand, multihoming host (or router, but
1245          * not mrouter) cannot join to more than one interface - it will
1246          * result in receiving multiple packets.
1247          */
1248         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1249                 ipmr_forward_finish);
1250         return;
1251
1252 out_free:
1253         kfree_skb(skb);
1254         return;
1255 }
1256
1257 static int ipmr_find_vif(struct net_device *dev)
1258 {
1259         int ct;
1260         for (ct=maxvif-1; ct>=0; ct--) {
1261                 if (vif_table[ct].dev == dev)
1262                         break;
1263         }
1264         return ct;
1265 }
1266
1267 /* "local" means that we should preserve one skb (for local delivery) */
1268
1269 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1270 {
1271         int psend = -1;
1272         int vif, ct;
1273
1274         vif = cache->mfc_parent;
1275         cache->mfc_un.res.pkt++;
1276         cache->mfc_un.res.bytes += skb->len;
1277
1278         /*
1279          * Wrong interface: drop packet and (maybe) send PIM assert.
1280          */
1281         if (vif_table[vif].dev != skb->dev) {
1282                 int true_vifi;
1283
1284                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1285                         /* It is our own packet, looped back.
1286                            Very complicated situation...
1287
1288                            The best workaround until routing daemons will be
1289                            fixed is not to redistribute packet, if it was
1290                            send through wrong interface. It means, that
1291                            multicast applications WILL NOT work for
1292                            (S,G), which have default multicast route pointing
1293                            to wrong oif. In any case, it is not a good
1294                            idea to use multicasting applications on router.
1295                          */
1296                         goto dont_forward;
1297                 }
1298
1299                 cache->mfc_un.res.wrong_if++;
1300                 true_vifi = ipmr_find_vif(skb->dev);
1301
1302                 if (true_vifi >= 0 && mroute_do_assert &&
1303                     /* pimsm uses asserts, when switching from RPT to SPT,
1304                        so that we cannot check that packet arrived on an oif.
1305                        It is bad, but otherwise we would need to move pretty
1306                        large chunk of pimd to kernel. Ough... --ANK
1307                      */
1308                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1309                     time_after(jiffies,
1310                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1311                         cache->mfc_un.res.last_assert = jiffies;
1312                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1313                 }
1314                 goto dont_forward;
1315         }
1316
1317         vif_table[vif].pkt_in++;
1318         vif_table[vif].bytes_in+=skb->len;
1319
1320         /*
1321          *      Forward the frame
1322          */
1323         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1324                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1325                         if (psend != -1) {
1326                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1327                                 if (skb2)
1328                                         ipmr_queue_xmit(skb2, cache, psend);
1329                         }
1330                         psend=ct;
1331                 }
1332         }
1333         if (psend != -1) {
1334                 if (local) {
1335                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1336                         if (skb2)
1337                                 ipmr_queue_xmit(skb2, cache, psend);
1338                 } else {
1339                         ipmr_queue_xmit(skb, cache, psend);
1340                         return 0;
1341                 }
1342         }
1343
1344 dont_forward:
1345         if (!local)
1346                 kfree_skb(skb);
1347         return 0;
1348 }
1349
1350
1351 /*
1352  *      Multicast packets for forwarding arrive here
1353  */
1354
1355 int ip_mr_input(struct sk_buff *skb)
1356 {
1357         struct mfc_cache *cache;
1358         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1359
1360         /* Packet is looped back after forward, it should not be
1361            forwarded second time, but still can be delivered locally.
1362          */
1363         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1364                 goto dont_forward;
1365
1366         if (!local) {
1367                     if (IPCB(skb)->opt.router_alert) {
1368                             if (ip_call_ra_chain(skb))
1369                                     return 0;
1370                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1371                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1372                                Cisco IOS <= 11.2(8)) do not put router alert
1373                                option to IGMP packets destined to routable
1374                                groups. It is very bad, because it means
1375                                that we can forward NO IGMP messages.
1376                              */
1377                             read_lock(&mrt_lock);
1378                             if (mroute_socket) {
1379                                     nf_reset(skb);
1380                                     raw_rcv(mroute_socket, skb);
1381                                     read_unlock(&mrt_lock);
1382                                     return 0;
1383                             }
1384                             read_unlock(&mrt_lock);
1385                     }
1386         }
1387
1388         read_lock(&mrt_lock);
1389         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1390
1391         /*
1392          *      No usable cache entry
1393          */
1394         if (cache==NULL) {
1395                 int vif;
1396
1397                 if (local) {
1398                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1399                         ip_local_deliver(skb);
1400                         if (skb2 == NULL) {
1401                                 read_unlock(&mrt_lock);
1402                                 return -ENOBUFS;
1403                         }
1404                         skb = skb2;
1405                 }
1406
1407                 vif = ipmr_find_vif(skb->dev);
1408                 if (vif >= 0) {
1409                         int err = ipmr_cache_unresolved(vif, skb);
1410                         read_unlock(&mrt_lock);
1411
1412                         return err;
1413                 }
1414                 read_unlock(&mrt_lock);
1415                 kfree_skb(skb);
1416                 return -ENODEV;
1417         }
1418
1419         ip_mr_forward(skb, cache, local);
1420
1421         read_unlock(&mrt_lock);
1422
1423         if (local)
1424                 return ip_local_deliver(skb);
1425
1426         return 0;
1427
1428 dont_forward:
1429         if (local)
1430                 return ip_local_deliver(skb);
1431         kfree_skb(skb);
1432         return 0;
1433 }
1434
1435 #ifdef CONFIG_IP_PIMSM_V1
1436 /*
1437  * Handle IGMP messages of PIMv1
1438  */
1439
1440 int pim_rcv_v1(struct sk_buff * skb)
1441 {
1442         struct igmphdr *pim;
1443         struct iphdr   *encap;
1444         struct net_device  *reg_dev = NULL;
1445
1446         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1447                 goto drop;
1448
1449         pim = igmp_hdr(skb);
1450
1451         if (!mroute_do_pim ||
1452             skb->len < sizeof(*pim) + sizeof(*encap) ||
1453             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1454                 goto drop;
1455
1456         encap = (struct iphdr *)(skb_transport_header(skb) +
1457                                  sizeof(struct igmphdr));
1458         /*
1459            Check that:
1460            a. packet is really destinted to a multicast group
1461            b. packet is not a NULL-REGISTER
1462            c. packet is not truncated
1463          */
1464         if (!MULTICAST(encap->daddr) ||
1465             encap->tot_len == 0 ||
1466             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1467                 goto drop;
1468
1469         read_lock(&mrt_lock);
1470         if (reg_vif_num >= 0)
1471                 reg_dev = vif_table[reg_vif_num].dev;
1472         if (reg_dev)
1473                 dev_hold(reg_dev);
1474         read_unlock(&mrt_lock);
1475
1476         if (reg_dev == NULL)
1477                 goto drop;
1478
1479         skb->mac_header = skb->network_header;
1480         skb_pull(skb, (u8*)encap - skb->data);
1481         skb_reset_network_header(skb);
1482         skb->dev = reg_dev;
1483         skb->protocol = htons(ETH_P_IP);
1484         skb->ip_summed = 0;
1485         skb->pkt_type = PACKET_HOST;
1486         dst_release(skb->dst);
1487         skb->dst = NULL;
1488         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1489         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1490         nf_reset(skb);
1491         netif_rx(skb);
1492         dev_put(reg_dev);
1493         return 0;
1494  drop:
1495         kfree_skb(skb);
1496         return 0;
1497 }
1498 #endif
1499
1500 #ifdef CONFIG_IP_PIMSM_V2
1501 static int pim_rcv(struct sk_buff * skb)
1502 {
1503         struct pimreghdr *pim;
1504         struct iphdr   *encap;
1505         struct net_device  *reg_dev = NULL;
1506
1507         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1508                 goto drop;
1509
1510         pim = (struct pimreghdr *)skb_transport_header(skb);
1511         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1512             (pim->flags&PIM_NULL_REGISTER) ||
1513             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1514              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1515                 goto drop;
1516
1517         /* check if the inner packet is destined to mcast group */
1518         encap = (struct iphdr *)(skb_transport_header(skb) +
1519                                  sizeof(struct pimreghdr));
1520         if (!MULTICAST(encap->daddr) ||
1521             encap->tot_len == 0 ||
1522             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1523                 goto drop;
1524
1525         read_lock(&mrt_lock);
1526         if (reg_vif_num >= 0)
1527                 reg_dev = vif_table[reg_vif_num].dev;
1528         if (reg_dev)
1529                 dev_hold(reg_dev);
1530         read_unlock(&mrt_lock);
1531
1532         if (reg_dev == NULL)
1533                 goto drop;
1534
1535         skb->mac_header = skb->network_header;
1536         skb_pull(skb, (u8*)encap - skb->data);
1537         skb_reset_network_header(skb);
1538         skb->dev = reg_dev;
1539         skb->protocol = htons(ETH_P_IP);
1540         skb->ip_summed = 0;
1541         skb->pkt_type = PACKET_HOST;
1542         dst_release(skb->dst);
1543         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1544         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1545         skb->dst = NULL;
1546         nf_reset(skb);
1547         netif_rx(skb);
1548         dev_put(reg_dev);
1549         return 0;
1550  drop:
1551         kfree_skb(skb);
1552         return 0;
1553 }
1554 #endif
1555
1556 static int
1557 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1558 {
1559         int ct;
1560         struct rtnexthop *nhp;
1561         struct net_device *dev = vif_table[c->mfc_parent].dev;
1562         u8 *b = skb_tail_pointer(skb);
1563         struct rtattr *mp_head;
1564
1565         if (dev)
1566                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1567
1568         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1569
1570         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1571                 if (c->mfc_un.res.ttls[ct] < 255) {
1572                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1573                                 goto rtattr_failure;
1574                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1575                         nhp->rtnh_flags = 0;
1576                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1577                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1578                         nhp->rtnh_len = sizeof(*nhp);
1579                 }
1580         }
1581         mp_head->rta_type = RTA_MULTIPATH;
1582         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1583         rtm->rtm_type = RTN_MULTICAST;
1584         return 1;
1585
1586 rtattr_failure:
1587         nlmsg_trim(skb, b);
1588         return -EMSGSIZE;
1589 }
1590
1591 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1592 {
1593         int err;
1594         struct mfc_cache *cache;
1595         struct rtable *rt = (struct rtable*)skb->dst;
1596
1597         read_lock(&mrt_lock);
1598         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1599
1600         if (cache==NULL) {
1601                 struct sk_buff *skb2;
1602                 struct iphdr *iph;
1603                 struct net_device *dev;
1604                 int vif;
1605
1606                 if (nowait) {
1607                         read_unlock(&mrt_lock);
1608                         return -EAGAIN;
1609                 }
1610
1611                 dev = skb->dev;
1612                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1613                         read_unlock(&mrt_lock);
1614                         return -ENODEV;
1615                 }
1616                 skb2 = skb_clone(skb, GFP_ATOMIC);
1617                 if (!skb2) {
1618                         read_unlock(&mrt_lock);
1619                         return -ENOMEM;
1620                 }
1621
1622                 skb_push(skb2, sizeof(struct iphdr));
1623                 skb_reset_network_header(skb2);
1624                 iph = ip_hdr(skb2);
1625                 iph->ihl = sizeof(struct iphdr) >> 2;
1626                 iph->saddr = rt->rt_src;
1627                 iph->daddr = rt->rt_dst;
1628                 iph->version = 0;
1629                 err = ipmr_cache_unresolved(vif, skb2);
1630                 read_unlock(&mrt_lock);
1631                 return err;
1632         }
1633
1634         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1635                 cache->mfc_flags |= MFC_NOTIFY;
1636         err = ipmr_fill_mroute(skb, cache, rtm);
1637         read_unlock(&mrt_lock);
1638         return err;
1639 }
1640
1641 #ifdef CONFIG_PROC_FS
1642 /*
1643  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1644  */
/* Per-open iteration state of the vif seq_file: current vif_table index. */
struct ipmr_vif_iter {
        int     ct;
};
1648
1649 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1650                                            loff_t pos)
1651 {
1652         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1653                 if (!VIF_EXISTS(iter->ct))
1654                         continue;
1655                 if (pos-- == 0)
1656                         return &vif_table[iter->ct];
1657         }
1658         return NULL;
1659 }
1660
1661 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1662 {
1663         read_lock(&mrt_lock);
1664         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1665                 : SEQ_START_TOKEN;
1666 }
1667
1668 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1669 {
1670         struct ipmr_vif_iter *iter = seq->private;
1671
1672         ++*pos;
1673         if (v == SEQ_START_TOKEN)
1674                 return ipmr_vif_seq_idx(iter, 0);
1675
1676         while (++iter->ct < maxvif) {
1677                 if (!VIF_EXISTS(iter->ct))
1678                         continue;
1679                 return &vif_table[iter->ct];
1680         }
1681         return NULL;
1682 }
1683
1684 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1685 {
1686         read_unlock(&mrt_lock);
1687 }
1688
1689 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1690 {
1691         if (v == SEQ_START_TOKEN) {
1692                 seq_puts(seq,
1693                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1694         } else {
1695                 const struct vif_device *vif = v;
1696                 const char *name =  vif->dev ? vif->dev->name : "none";
1697
1698                 seq_printf(seq,
1699                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1700                            vif - vif_table,
1701                            name, vif->bytes_in, vif->pkt_in,
1702                            vif->bytes_out, vif->pkt_out,
1703                            vif->flags, vif->local, vif->remote);
1704         }
1705         return 0;
1706 }
1707
1708 static const struct seq_operations ipmr_vif_seq_ops = {
1709         .start = ipmr_vif_seq_start,
1710         .next  = ipmr_vif_seq_next,
1711         .stop  = ipmr_vif_seq_stop,
1712         .show  = ipmr_vif_seq_show,
1713 };
1714
1715 static int ipmr_vif_open(struct inode *inode, struct file *file)
1716 {
1717         return seq_open_private(file, &ipmr_vif_seq_ops,
1718                         sizeof(struct ipmr_vif_iter));
1719 }
1720
1721 static const struct file_operations ipmr_vif_fops = {
1722         .owner   = THIS_MODULE,
1723         .open    = ipmr_vif_open,
1724         .read    = seq_read,
1725         .llseek  = seq_lseek,
1726         .release = seq_release_private,
1727 };
1728
/*
 * Per-open iteration state of the mfc seq_file.
 *
 * cache records which list is being walked (mfc_cache_array,
 * &mfc_unres_queue, or NULL for none); the stop callback uses it to
 * decide which lock to release.  ct is the current mfc_cache_array line.
 */
struct ipmr_mfc_iter {
        struct mfc_cache        **cache;
        int                     ct;
};
1733
1734
1735 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1736 {
1737         struct mfc_cache *mfc;
1738
1739         it->cache = mfc_cache_array;
1740         read_lock(&mrt_lock);
1741         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1742                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1743                         if (pos-- == 0)
1744                                 return mfc;
1745         read_unlock(&mrt_lock);
1746
1747         it->cache = &mfc_unres_queue;
1748         spin_lock_bh(&mfc_unres_lock);
1749         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1750                 if (pos-- == 0)
1751                         return mfc;
1752         spin_unlock_bh(&mfc_unres_lock);
1753
1754         it->cache = NULL;
1755         return NULL;
1756 }
1757
1758
1759 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1760 {
1761         struct ipmr_mfc_iter *it = seq->private;
1762         it->cache = NULL;
1763         it->ct = 0;
1764         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1765                 : SEQ_START_TOKEN;
1766 }
1767
1768 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1769 {
1770         struct mfc_cache *mfc = v;
1771         struct ipmr_mfc_iter *it = seq->private;
1772
1773         ++*pos;
1774
1775         if (v == SEQ_START_TOKEN)
1776                 return ipmr_mfc_seq_idx(seq->private, 0);
1777
1778         if (mfc->next)
1779                 return mfc->next;
1780
1781         if (it->cache == &mfc_unres_queue)
1782                 goto end_of_list;
1783
1784         BUG_ON(it->cache != mfc_cache_array);
1785
1786         while (++it->ct < MFC_LINES) {
1787                 mfc = mfc_cache_array[it->ct];
1788                 if (mfc)
1789                         return mfc;
1790         }
1791
1792         /* exhausted cache_array, show unresolved */
1793         read_unlock(&mrt_lock);
1794         it->cache = &mfc_unres_queue;
1795         it->ct = 0;
1796
1797         spin_lock_bh(&mfc_unres_lock);
1798         mfc = mfc_unres_queue;
1799         if (mfc)
1800                 return mfc;
1801
1802  end_of_list:
1803         spin_unlock_bh(&mfc_unres_lock);
1804         it->cache = NULL;
1805
1806         return NULL;
1807 }
1808
1809 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1810 {
1811         struct ipmr_mfc_iter *it = seq->private;
1812
1813         if (it->cache == &mfc_unres_queue)
1814                 spin_unlock_bh(&mfc_unres_lock);
1815         else if (it->cache == mfc_cache_array)
1816                 read_unlock(&mrt_lock);
1817 }
1818
1819 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1820 {
1821         int n;
1822
1823         if (v == SEQ_START_TOKEN) {
1824                 seq_puts(seq,
1825                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1826         } else {
1827                 const struct mfc_cache *mfc = v;
1828                 const struct ipmr_mfc_iter *it = seq->private;
1829
1830                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1831                            (unsigned long) mfc->mfc_mcastgrp,
1832                            (unsigned long) mfc->mfc_origin,
1833                            mfc->mfc_parent,
1834                            mfc->mfc_un.res.pkt,
1835                            mfc->mfc_un.res.bytes,
1836                            mfc->mfc_un.res.wrong_if);
1837
1838                 if (it->cache != &mfc_unres_queue) {
1839                         for (n = mfc->mfc_un.res.minvif;
1840                              n < mfc->mfc_un.res.maxvif; n++ ) {
1841                                 if (VIF_EXISTS(n)
1842                                    && mfc->mfc_un.res.ttls[n] < 255)
1843                                 seq_printf(seq,
1844                                            " %2d:%-3d",
1845                                            n, mfc->mfc_un.res.ttls[n]);
1846                         }
1847                 }
1848                 seq_putc(seq, '\n');
1849         }
1850         return 0;
1851 }
1852
1853 static const struct seq_operations ipmr_mfc_seq_ops = {
1854         .start = ipmr_mfc_seq_start,
1855         .next  = ipmr_mfc_seq_next,
1856         .stop  = ipmr_mfc_seq_stop,
1857         .show  = ipmr_mfc_seq_show,
1858 };
1859
1860 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1861 {
1862         return seq_open_private(file, &ipmr_mfc_seq_ops,
1863                         sizeof(struct ipmr_mfc_iter));
1864 }
1865
1866 static const struct file_operations ipmr_mfc_fops = {
1867         .owner   = THIS_MODULE,
1868         .open    = ipmr_mfc_open,
1869         .read    = seq_read,
1870         .llseek  = seq_lseek,
1871         .release = seq_release_private,
1872 };
1873 #endif
1874
1875 #ifdef CONFIG_IP_PIMSM_V2
1876 static struct net_protocol pim_protocol = {
1877         .handler        =       pim_rcv,
1878 };
1879 #endif
1880
1881
1882 /*
1883  *      Setup for IP multicast routing
1884  */
1885
1886 void __init ip_mr_init(void)
1887 {
1888         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1889                                        sizeof(struct mfc_cache),
1890                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1891                                        NULL);
1892         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1893         register_netdevice_notifier(&ip_mr_notifier);
1894 #ifdef CONFIG_PROC_FS
1895         proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
1896         proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
1897 #endif
1898 }