[AF_PACKET]: Allow multicast traffic to be caught by ORIGDEV when bonded
[linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      will not fit into the reserved space (tunnel); others are silly
94      (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely, that it points to ll
111                  header.  PPP does this, which is wrong, because it
112                  introduces asymmetry between rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position,
135    packet classifier depends on it.
136  */
137
138 /* List of all packet sockets. */
139 static HLIST_HEAD(packet_sklist);
/* Guards packet_sklist; taken write-side (bh-disabled) when sockets are
 * added/removed (see packet_release). */
140 static DEFINE_RWLOCK(packet_sklist_lock);
141
142 /* Private packet socket structures. */
143
/* One multicast/membership entry held by a packet socket (see mclist in
 * struct packet_sock and packet_flush_mclist()). */
144 struct packet_mclist
145 {
146         struct packet_mclist    *next;          /* singly-linked list */
147         int                     ifindex;        /* device this entry applies to */
148         int                     count;          /* NOTE(review): presumably a refcount for repeated joins — the add/del code is outside this chunk */
149         unsigned short          type;           /* membership type */
150         unsigned short          alen;           /* bytes of addr[] in use */
151         unsigned char           addr[MAX_ADDR_LEN];     /* hardware address */
152 };
153 /* identical to struct packet_mreq except it has
154  * a longer address field.
155  */
156 struct packet_mreq_max
157 {
158         int             mr_ifindex;     /* interface index */
159         unsigned short  mr_type;        /* membership type */
160         unsigned short  mr_alen;        /* bytes of mr_address[] in use */
161         unsigned char   mr_address[MAX_ADDR_LEN];       /* hardware address (longer than struct packet_mreq's) */
162 };
163
164 #ifdef CONFIG_PACKET_MMAP
165 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
166 #endif
167
168 static void packet_flush_mclist(struct sock *sk);
169
/* AF_PACKET protocol-private socket state; obtained via pkt_sk(). */
170 struct packet_sock {
171         /* struct sock has to be the first member of packet_sock */
172         struct sock             sk;
173         struct tpacket_stats    stats;          /* tp_packets / tp_drops counters */
174 #ifdef CONFIG_PACKET_MMAP
175         char *                  *pg_vec;        /* blocks backing the rx ring */
176         unsigned int            head;           /* index of next ring frame to fill (see tpacket_rcv) */
177         unsigned int            frames_per_block;
178         unsigned int            frame_size;
179         unsigned int            frame_max;      /* highest valid frame index; head wraps to 0 past it */
180         int                     copy_thresh;    /* if set, keep a full copy of truncated ring frames */
181 #endif
182         struct packet_type      prot_hook;      /* our hook into the rx path */
183         spinlock_t              bind_lock;      /* serializes attach/detach of prot_hook */
184         unsigned int            running:1,      /* prot_hook is attached*/
185                                 auxdata:1,      /* PACKET_AUXDATA option flag (used outside this chunk) */
186                                 origdev:1;      /* PACKET_ORIGDEV: report orig_dev instead of dev */
187         int                     ifindex;        /* bound device         */
188         __be16                  num;            /* bound protocol number */
189         struct packet_mclist    *mclist;        /* multicast memberships */
190 #ifdef CONFIG_PACKET_MMAP
191         atomic_t                mapped;         /* ring mapped into user space */
192         unsigned int            pg_vec_order;   /* page order of each pg_vec block */
193         unsigned int            pg_vec_pages;
194         unsigned int            pg_vec_len;
195 #endif
196 };
197
/* Layout of skb->cb while an skb is queued on a packet socket: the
 * pre-trim length plus the address recvmsg will return to the user. */
198 struct packet_skb_cb {
199         unsigned int origlen;           /* skb->len before pskb_trim() to snaplen */
200         union {
201                 struct sockaddr_pkt pkt;        /* SOCK_PACKET sockets */
202                 struct sockaddr_ll ll;          /* SOCK_RAW / SOCK_DGRAM sockets */
203         } sa;
204 };
205
206 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
207
208 #ifdef CONFIG_PACKET_MMAP
209
210 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
211 {
212         unsigned int pg_vec_pos, frame_offset;
213
214         pg_vec_pos = position / po->frames_per_block;
215         frame_offset = position % po->frames_per_block;
216
217         return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
218 }
219 #endif
220
/* Downcast a generic sock to its packet_sock container; valid because
 * struct sock is the first member of struct packet_sock. */
221 static inline struct packet_sock *pkt_sk(struct sock *sk)
222 {
223         return (struct packet_sock *)sk;
224 }
225
226 static void packet_sock_destruct(struct sock *sk)
227 {
228         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
229         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
230
231         if (!sock_flag(sk, SOCK_DEAD)) {
232                 printk("Attempt to release alive packet socket: %p\n", sk);
233                 return;
234         }
235
236         sk_refcnt_debug_dec(sk);
237 }
238
239
240 static const struct proto_ops packet_ops;
241
242 static const struct proto_ops packet_ops_spkt;
243
/*
 * Receive hook for SOCK_PACKET sockets, called from the network rx path
 * via the packet_type registered at bind time.  Fills in a sockaddr_pkt
 * in skb->cb and queues the skb on the socket.  Always returns 0: the
 * skb is consumed here on every path (queued or freed).
 */
244 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
245 {
246         struct sock *sk;
247         struct sockaddr_pkt *spkt;
248
        /* Only the initial network namespace is supported. */
249         if (dev->nd_net != &init_net)
250                 goto out;
251
252         /*
253          *      When we registered the protocol we saved the socket in the data
254          *      field for just this event.
255          */
256
257         sk = pt->af_packet_priv;
258
259         /*
260          *      Yank back the headers [hope the device set this
261          *      right or kerboom...]
262          *
263          *      Incoming packets have ll header pulled,
264          *      push it back.
265          *
266          *      For outgoing ones skb->data == skb_mac_header(skb)
267          *      so that this procedure is noop.
268          */
269
270         if (skb->pkt_type == PACKET_LOOPBACK)
271                 goto out;
272
        /* We may write skb->cb below; get a private copy if shared. */
273         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
274                 goto oom;
275
276         /* drop any routing info */
277         dst_release(skb->dst);
278         skb->dst = NULL;
279
280         /* drop conntrack reference */
281         nf_reset(skb);
282
283         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
284
285         skb_push(skb, skb->data - skb_mac_header(skb));
286
287         /*
288          *      The SOCK_PACKET socket receives _all_ frames.
289          */
290
291         spkt->spkt_family = dev->type;
292         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
293         spkt->spkt_protocol = skb->protocol;
294
295         /*
296          *      Charge the memory to the socket. This is done specifically
297          *      to prevent sockets using all the memory up.
298          */
299
300         if (sock_queue_rcv_skb(sk,skb) == 0)
301                 return 0;
302
303 out:
304         kfree_skb(skb);
305 oom:
306         return 0;
307 }
308
309
310 /*
311  *      Output a raw packet to a device layer. This bypasses all the other
312  *      protocol layers and you must therefore supply it with a complete frame
313  */
314
/*
 * sendmsg() for SOCK_PACKET sockets: build a raw frame and hand it
 * straight to the device.  The caller must supply a sockaddr_pkt naming
 * the output device; the frame must fit in mtu + hard_header_len.
 * Returns the number of bytes sent or a negative errno.
 */
315 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
316                                struct msghdr *msg, size_t len)
317 {
318         struct sock *sk = sock->sk;
319         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
320         struct sk_buff *skb;
321         struct net_device *dev;
322         __be16 proto=0;
323         int err;
324
325         /*
326          *      Get and verify the address.
327          */
328
329         if (saddr)
330         {
331                 if (msg->msg_namelen < sizeof(struct sockaddr))
332                         return(-EINVAL);
333                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
334                         proto=saddr->spkt_protocol;
335         }
336         else
337                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
338
339         /*
340          *      Find the device first to size check it
341          */
342
        /* Force NUL termination of the device name before the lookup
         * (spkt_device's last byte; the name came from user space). */
343         saddr->spkt_device[13] = 0;
344         dev = dev_get_by_name(&init_net, saddr->spkt_device);
345         err = -ENODEV;
346         if (dev == NULL)
347                 goto out_unlock;
348
349         err = -ENETDOWN;
350         if (!(dev->flags & IFF_UP))
351                 goto out_unlock;
352
353         /*
354          *      You may not queue a frame bigger than the mtu. This is the lowest level
355          *      raw protocol and you must do your own fragmentation at this level.
356          */
357
358         err = -EMSGSIZE;
359         if (len > dev->mtu + dev->hard_header_len)
360                 goto out_unlock;
361
362         err = -ENOBUFS;
363         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
364
365         /*
366          *      If the write buffer is full, then tough. At this level the user gets to
367          *      deal with the problem - do your own algorithmic backoffs. That's far
368          *      more flexible.
369          */
370
371         if (skb == NULL)
372                 goto out_unlock;
373
374         /*
375          *      Fill it in
376          */
377
378         /* FIXME: Save some space for broken drivers that write a
379          * hard header at transmission time by themselves. PPP is the
380          * notable one here. This should really be fixed at the driver level.
381          */
382         skb_reserve(skb, LL_RESERVED_SPACE(dev));
383         skb_reset_network_header(skb);
384
385         /* Try to align data part correctly */
386         if (dev->header_ops) {
387                 skb->data -= dev->hard_header_len;
388                 skb->tail -= dev->hard_header_len;
389                 if (len < dev->hard_header_len)
390                         skb_reset_network_header(skb);
391         }
392
393         /* Returns -EFAULT on error */
394         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
395         skb->protocol = proto;
396         skb->dev = dev;
397         skb->priority = sk->sk_priority;
398         if (err)
399                 goto out_free;
400
401         /*
402          *      Now send it
403          */
404
        /* dev_queue_xmit() consumes the skb on both success and failure. */
405         dev_queue_xmit(skb);
406         dev_put(dev);
407         return(len);
408
409 out_free:
410         kfree_skb(skb);
411 out_unlock:
412         if (dev)
413                 dev_put(dev);
414         return err;
415 }
416
/*
 * Run the socket's attached BPF filter (if any) over the skb under the
 * RCU bh read lock.  Returns the filter's snap length, or @res
 * unchanged when no filter is attached (0 from the filter means drop).
 */
417 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
418                                       unsigned int res)
419 {
420         struct sk_filter *filter;
421
422         rcu_read_lock_bh();
423         filter = rcu_dereference(sk->sk_filter);
424         if (filter != NULL)
425                 res = sk_run_filter(skb, filter->insns, filter->len);
426         rcu_read_unlock_bh();
427
428         return res;
429 }
430
431 /*
432    This function makes lazy skb cloning in hope that most of packets
433    are discarded by BPF.
434
435    Note tricky part: we DO mangle shared skb! skb->data, skb->len
436    and skb->cb are mangled. It works because (and until) packets
437    falling here are owned by current CPU. Output packets are cloned
438    by dev_queue_xmit_nit(), input packets are processed by net_bh
439    sequentially, so that if we return skb to original state on exit,
440    we will not harm anyone.
441  */
442
/*
 * Receive hook for SOCK_RAW/SOCK_DGRAM packet sockets (non-mmap path).
 * Runs the BPF filter, records the sender's sockaddr_ll in skb->cb,
 * trims the skb to the snap length and queues it on the receive queue.
 * Always returns 0; on failure a shared skb is restored to its original
 * data/len (see the lazy-clone comment above) before being freed.
 */
443 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
444 {
445         struct sock *sk;
446         struct sockaddr_ll *sll;
447         struct packet_sock *po;
448         u8 * skb_head = skb->data;
449         int skb_len = skb->len;
450         unsigned int snaplen, res;
451
        /* Only the initial network namespace is supported. */
452         if (dev->nd_net != &init_net)
453                 goto drop;
454
455         if (skb->pkt_type == PACKET_LOOPBACK)
456                 goto drop;
457
458         sk = pt->af_packet_priv;
459         po = pkt_sk(sk);
460
461         skb->dev = dev;
462
463         if (dev->header_ops) {
464                 /* The device has an explicit notion of ll header,
465                    exported to higher levels.
466
467                    Otherwise, the device hides details of its frame
468                    structure, so that the corresponding packet head is
469                    never delivered to user.
470                  */
471                 if (sk->sk_type != SOCK_DGRAM)
472                         skb_push(skb, skb->data - skb_mac_header(skb));
473                 else if (skb->pkt_type == PACKET_OUTGOING) {
474                         /* Special case: outgoing packets have ll header at head */
475                         skb_pull(skb, skb_network_offset(skb));
476                 }
477         }
478
479         snaplen = skb->len;
480
481         res = run_filter(skb, sk, snaplen);
482         if (!res)
483                 goto drop_n_restore;
484         if (snaplen > res)
485                 snaplen = res;
486
        /* Receive-buffer accounting: drop (and count) when full. */
487         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
488             (unsigned)sk->sk_rcvbuf)
489                 goto drop_n_acct;
490
        /* We are about to mangle skb->cb and trim: take a private clone
         * of a shared skb, restoring the original first. */
491         if (skb_shared(skb)) {
492                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
493                 if (nskb == NULL)
494                         goto drop_n_acct;
495
496                 if (skb_head != skb->data) {
497                         skb->data = skb_head;
498                         skb->len = skb_len;
499                 }
500                 kfree_skb(skb);
501                 skb = nskb;
502         }
503
        /* sockaddr_ll plus a max-size hardware address must fit in cb. */
504         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
505                      sizeof(skb->cb));
506
507         sll = &PACKET_SKB_CB(skb)->sa.ll;
508         sll->sll_family = AF_PACKET;
509         sll->sll_hatype = dev->type;
510         sll->sll_protocol = skb->protocol;
511         sll->sll_pkttype = skb->pkt_type;
        /* PACKET_ORIGDEV: report the original ingress device (e.g. a
         * bonding slave) instead of the device the hook fired on. */
512         if (unlikely(po->origdev))
513                 sll->sll_ifindex = orig_dev->ifindex;
514         else
515                 sll->sll_ifindex = dev->ifindex;
516
517         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
518
        /* Remember the full length before trimming to snaplen. */
519         PACKET_SKB_CB(skb)->origlen = skb->len;
520
521         if (pskb_trim(skb, snaplen))
522                 goto drop_n_acct;
523
524         skb_set_owner_r(skb, sk);
525         skb->dev = NULL;
526         dst_release(skb->dst);
527         skb->dst = NULL;
528
529         /* drop conntrack reference */
530         nf_reset(skb);
531
532         spin_lock(&sk->sk_receive_queue.lock);
533         po->stats.tp_packets++;
534         __skb_queue_tail(&sk->sk_receive_queue, skb);
535         spin_unlock(&sk->sk_receive_queue.lock);
536         sk->sk_data_ready(sk, skb->len);
537         return 0;
538
539 drop_n_acct:
540         spin_lock(&sk->sk_receive_queue.lock);
541         po->stats.tp_drops++;
542         spin_unlock(&sk->sk_receive_queue.lock);
543
544 drop_n_restore:
545         if (skb_head != skb->data && skb_shared(skb)) {
546                 skb->data = skb_head;
547                 skb->len = skb_len;
548         }
549 drop:
550         kfree_skb(skb);
551         return 0;
552 }
553
554 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook for mmap (PACKET_RX_RING) sockets.  Copies the packet
 * into the next free ring frame, fills in the tpacket_hdr and trailing
 * sockaddr_ll, then publishes the frame to user space by setting
 * tp_status.  Always returns 0; the skb itself is never queued (except
 * for an optional full copy when the frame had to be truncated).
 */
555 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
556 {
557         struct sock *sk;
558         struct packet_sock *po;
559         struct sockaddr_ll *sll;
560         struct tpacket_hdr *h;
561         u8 * skb_head = skb->data;
562         int skb_len = skb->len;
563         unsigned int snaplen, res;
        /* Assume we are losing packets; cleared below if tp_drops == 0. */
564         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
565         unsigned short macoff, netoff;
566         struct sk_buff *copy_skb = NULL;
567         struct timeval tv;
568
        /* Only the initial network namespace is supported. */
569         if (dev->nd_net != &init_net)
570                 goto drop;
571
572         if (skb->pkt_type == PACKET_LOOPBACK)
573                 goto drop;
574
575         sk = pt->af_packet_priv;
576         po = pkt_sk(sk);
577
578         if (dev->header_ops) {
579                 if (sk->sk_type != SOCK_DGRAM)
580                         skb_push(skb, skb->data - skb_mac_header(skb));
581                 else if (skb->pkt_type == PACKET_OUTGOING) {
582                         /* Special case: outgoing packets have ll header at head */
583                         skb_pull(skb, skb_network_offset(skb));
584                 }
585         }
586
        /* Tell user space the checksum has not been computed yet. */
587         if (skb->ip_summed == CHECKSUM_PARTIAL)
588                 status |= TP_STATUS_CSUMNOTREADY;
589
590         snaplen = skb->len;
591
592         res = run_filter(skb, sk, snaplen);
593         if (!res)
594                 goto drop_n_restore;
595         if (snaplen > res)
596                 snaplen = res;
597
        /* Compute where the mac and net headers land inside the frame. */
598         if (sk->sk_type == SOCK_DGRAM) {
599                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
600         } else {
601                 unsigned maclen = skb_network_offset(skb);
602                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
603                 macoff = netoff - maclen;
604         }
605
        /* Packet does not fit in the frame: truncate, and optionally
         * keep a full copy on the receive queue (copy_thresh). */
606         if (macoff + snaplen > po->frame_size) {
607                 if (po->copy_thresh &&
608                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
609                     (unsigned)sk->sk_rcvbuf) {
610                         if (skb_shared(skb)) {
611                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
612                         } else {
613                                 copy_skb = skb_get(skb);
614                                 skb_head = skb->data;
615                         }
616                         if (copy_skb)
617                                 skb_set_owner_r(copy_skb, sk);
618                 }
619                 snaplen = po->frame_size - macoff;
620                 if ((int)snaplen < 0)
621                         snaplen = 0;
622         }
623
624         spin_lock(&sk->sk_receive_queue.lock);
625         h = packet_lookup_frame(po, po->head);
626
        /* Frame still owned by user space -> ring is full. */
627         if (h->tp_status)
628                 goto ring_is_full;
629         po->head = po->head != po->frame_max ? po->head+1 : 0;
630         po->stats.tp_packets++;
631         if (copy_skb) {
632                 status |= TP_STATUS_COPY;
633                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
634         }
635         if (!po->stats.tp_drops)
636                 status &= ~TP_STATUS_LOSING;
637         spin_unlock(&sk->sk_receive_queue.lock);
638
639         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
640
641         h->tp_len = skb->len;
642         h->tp_snaplen = snaplen;
643         h->tp_mac = macoff;
644         h->tp_net = netoff;
645         if (skb->tstamp.tv64)
646                 tv = ktime_to_timeval(skb->tstamp);
647         else
648                 do_gettimeofday(&tv);
649         h->tp_sec = tv.tv_sec;
650         h->tp_usec = tv.tv_usec;
651
652         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
653         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
654         sll->sll_family = AF_PACKET;
655         sll->sll_hatype = dev->type;
656         sll->sll_protocol = skb->protocol;
657         sll->sll_pkttype = skb->pkt_type;
        /* PACKET_ORIGDEV: report the original ingress device (e.g. a
         * bonding slave) instead of the device the hook fired on. */
658         if (unlikely(po->origdev))
659                 sll->sll_ifindex = orig_dev->ifindex;
660         else
661                 sll->sll_ifindex = dev->ifindex;
662
        /* Publish the frame: the tp_status store hands ownership to user
         * space; the barrier orders it before the wakeup below. */
663         h->tp_status = status;
664         smp_mb();
665
        /* Keep the user-space mapping coherent on aliasing caches. */
666         {
667                 struct page *p_start, *p_end;
668                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
669
670                 p_start = virt_to_page(h);
671                 p_end = virt_to_page(h_end);
672                 while (p_start <= p_end) {
673                         flush_dcache_page(p_start);
674                         p_start++;
675                 }
676         }
677
678         sk->sk_data_ready(sk, 0);
679
680 drop_n_restore:
681         if (skb_head != skb->data && skb_shared(skb)) {
682                 skb->data = skb_head;
683                 skb->len = skb_len;
684         }
685 drop:
686         kfree_skb(skb);
687         return 0;
688
689 ring_is_full:
690         po->stats.tp_drops++;
691         spin_unlock(&sk->sk_receive_queue.lock);
692
        /* Wake the reader so it can drain the ring. */
693         sk->sk_data_ready(sk, 0);
694         if (copy_skb)
695                 kfree_skb(copy_skb);
696         goto drop_n_restore;
697 }
698
699 #endif
700
701
/*
 * sendmsg() for SOCK_RAW/SOCK_DGRAM packet sockets.  The destination
 * device/protocol come either from the bound socket state or from an
 * explicit sockaddr_ll; for SOCK_DGRAM the link-layer header is built
 * by the device, for SOCK_RAW the caller supplies the full frame.
 * Returns bytes sent or a negative errno.
 */
702 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
703                           struct msghdr *msg, size_t len)
704 {
705         struct sock *sk = sock->sk;
706         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
707         struct sk_buff *skb;
708         struct net_device *dev;
709         __be16 proto;
710         unsigned char *addr;
711         int ifindex, err, reserve = 0;
712
713         /*
714          *      Get and verify the address.
715          */
716
717         if (saddr == NULL) {
                /* No address given: use the socket's bound device/proto. */
718                 struct packet_sock *po = pkt_sk(sk);
719
720                 ifindex = po->ifindex;
721                 proto   = po->num;
722                 addr    = NULL;
723         } else {
724                 err = -EINVAL;
725                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
726                         goto out;
                /* The supplied hardware address must fit in the sockaddr. */
727                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
728                         goto out;
729                 ifindex = saddr->sll_ifindex;
730                 proto   = saddr->sll_protocol;
731                 addr    = saddr->sll_addr;
732         }
733
734
735         dev = dev_get_by_index(&init_net, ifindex);
736         err = -ENXIO;
737         if (dev == NULL)
738                 goto out_unlock;
        /* SOCK_RAW frames carry their own link-layer header: allow it
         * on top of the MTU in the size check below. */
739         if (sock->type == SOCK_RAW)
740                 reserve = dev->hard_header_len;
741
742         err = -ENETDOWN;
743         if (!(dev->flags & IFF_UP))
744                 goto out_unlock;
745
746         err = -EMSGSIZE;
747         if (len > dev->mtu+reserve)
748                 goto out_unlock;
749
750         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
751                                 msg->msg_flags & MSG_DONTWAIT, &err);
752         if (skb==NULL)
753                 goto out_unlock;
754
755         skb_reserve(skb, LL_RESERVED_SPACE(dev));
756         skb_reset_network_header(skb);
757
758         err = -EINVAL;
759         if (sock->type == SOCK_DGRAM &&
760             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
761                 goto out_free;
762
763         /* Returns -EFAULT on error */
764         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
765         if (err)
766                 goto out_free;
767
768         skb->protocol = proto;
769         skb->dev = dev;
770         skb->priority = sk->sk_priority;
771
772         /*
773          *      Now send it
774          */
775
        /* dev_queue_xmit() consumes the skb even on error; only the
         * device reference still needs dropping on the failure path. */
776         err = dev_queue_xmit(skb);
777         if (err > 0 && (err = net_xmit_errno(err)) != 0)
778                 goto out_unlock;
779
780         dev_put(dev);
781
782         return(len);
783
784 out_free:
785         kfree_skb(skb);
786 out_unlock:
787         if (dev)
788                 dev_put(dev);
789 out:
790         return err;
791 }
792
793 /*
794  *      Close a PACKET socket. This is fairly simple. We immediately go
795  *      to 'closed' state and remove our protocol entry in the device list.
796  */
797
798 static int packet_release(struct socket *sock)
799 {
800         struct sock *sk = sock->sk;
801         struct packet_sock *po;
802
803         if (!sk)
804                 return 0;
805
806         po = pkt_sk(sk);
807
        /* Take the socket off the global list first. */
808         write_lock_bh(&packet_sklist_lock);
809         sk_del_node_init(sk);
810         write_unlock_bh(&packet_sklist_lock);
811
812         /*
813          *      Unhook packet receive handler.
814          */
815
816         if (po->running) {
817                 /*
818                  *      Remove the protocol hook
819                  */
820                 dev_remove_pack(&po->prot_hook);
821                 po->running = 0;
822                 po->num = 0;
                /* Drop the reference the attached hook was holding. */
823                 __sock_put(sk);
824         }
825
826         packet_flush_mclist(sk);
827
828 #ifdef CONFIG_PACKET_MMAP
        /* Tear down any rx ring: a zeroed request frees the buffers. */
829         if (po->pg_vec) {
830                 struct tpacket_req req;
831                 memset(&req, 0, sizeof(req));
832                 packet_set_ring(sk, &req, 1);
833         }
834 #endif
835
836         /*
837          *      Now the socket is dead. No more input will appear.
838          */
839
840         sock_orphan(sk);
841         sock->sk = NULL;
842
843         /* Purge queues */
844
845         skb_queue_purge(&sk->sk_receive_queue);
846         sk_refcnt_debug_release(sk);
847
848         sock_put(sk);
849         return 0;
850 }
851
852 /*
853  *      Attach a packet hook.
854  */
855
/*
 * Attach (or re-attach) the socket's protocol hook to @dev/@protocol.
 * A NULL @dev binds to all devices; @protocol == 0 detaches only.
 * Always returns 0; a down device is signalled asynchronously through
 * sk_err/sk_error_report instead.
 */
856 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
857 {
858         struct packet_sock *po = pkt_sk(sk);
859         /*
860          *      Detach an existing hook if present.
861          */
862
863         lock_sock(sk);
864
865         spin_lock(&po->bind_lock);
866         if (po->running) {
                /* Drop the hook's socket reference, then release the
                 * spinlock around dev_remove_pack() — NOTE(review): it
                 * appears dev_remove_pack() may block, hence the
                 * drop/reacquire; confirm against its definition. */
867                 __sock_put(sk);
868                 po->running = 0;
869                 po->num = 0;
870                 spin_unlock(&po->bind_lock);
871                 dev_remove_pack(&po->prot_hook);
872                 spin_lock(&po->bind_lock);
873         }
874
875         po->num = protocol;
876         po->prot_hook.type = protocol;
877         po->prot_hook.dev = dev;
878
879         po->ifindex = dev ? dev->ifindex : 0;
880
        /* Protocol 0 means "unbound": leave the hook detached. */
881         if (protocol == 0)
882                 goto out_unlock;
883
884         if (dev) {
885                 if (dev->flags&IFF_UP) {
886                         dev_add_pack(&po->prot_hook);
887                         sock_hold(sk);
888                         po->running = 1;
889                 } else {
                        /* Device is down: report ENETDOWN to the user. */
890                         sk->sk_err = ENETDOWN;
891                         if (!sock_flag(sk, SOCK_DEAD))
892                                 sk->sk_error_report(sk);
893                 }
894         } else {
895                 dev_add_pack(&po->prot_hook);
896                 sock_hold(sk);
897                 po->running = 1;
898         }
899
900 out_unlock:
901         spin_unlock(&po->bind_lock);
902         release_sock(sk);
903         return 0;
904 }
905
906 /*
907  *      Bind a packet socket to a device
908  */
909
910 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
911 {
912         struct sock *sk=sock->sk;
913         char name[15];
914         struct net_device *dev;
915         int err = -ENODEV;
916
917         /*
918          *      Check legality
919          */
920
921         if (addr_len != sizeof(struct sockaddr))
922                 return -EINVAL;
923         strlcpy(name,uaddr->sa_data,sizeof(name));
924
925         dev = dev_get_by_name(&init_net, name);
926         if (dev) {
927                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
928                 dev_put(dev);
929         }
930         return err;
931 }
932
/* Bind an AF_PACKET socket to an interface/protocol given a sockaddr_ll. */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
	struct sock *sk=sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	/* sll_ifindex == 0 means "any device": bind with dev == NULL. */
	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(&init_net, sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	/* A zero sll_protocol keeps the socket's existing protocol number. */
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}
963
/* Slab/proto descriptor used by sk_alloc() for packet sockets. */
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
969
970 /*
971  *      Create a packet of type SOCK_PACKET.
972  */
973
static int packet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	/* Only the initial namespace is supported at this point. */
	if (net != &init_net)
		return -EAFNOSUPPORT;

	/* Raw link-level access is privileged. */
	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	/* SOCK_PACKET is the obsolete interface with its own ops table. */
	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	/* With a non-zero protocol, start receiving immediately; the
	 * extra sock_hold() reference is owned by the protocol hook and
	 * is released when the hook is removed (po->running tracks it).
	 */
	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	/* Make the socket visible to /proc and the netdev notifier. */
	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
1036
1037 /*
1038  *      Pull a packet from our receive queue and hand it to the user.
1039  *      If necessary we block.
1040  */
1041
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	/* The source address was stashed in the skb control block by
	 * packet_rcv()/packet_rcv_spkt(); only its length depends on type. */
	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	/* PACKET_AUXDATA: hand metadata to userspace as a cmsg. */
	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		/* Partial checksum means the wire checksum is not ready yet. */
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	/* MSG_TRUNC reports the full packet length, not the copied amount. */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1138
1139 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1140                                int *uaddr_len, int peer)
1141 {
1142         struct net_device *dev;
1143         struct sock *sk = sock->sk;
1144
1145         if (peer)
1146                 return -EOPNOTSUPP;
1147
1148         uaddr->sa_family = AF_PACKET;
1149         dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex);
1150         if (dev) {
1151                 strlcpy(uaddr->sa_data, dev->name, 15);
1152                 dev_put(dev);
1153         } else
1154                 memset(uaddr->sa_data, 0, 14);
1155         *uaddr_len = sizeof(*uaddr);
1156
1157         return 0;
1158 }
1159
1160 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1161                           int *uaddr_len, int peer)
1162 {
1163         struct net_device *dev;
1164         struct sock *sk = sock->sk;
1165         struct packet_sock *po = pkt_sk(sk);
1166         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1167
1168         if (peer)
1169                 return -EOPNOTSUPP;
1170
1171         sll->sll_family = AF_PACKET;
1172         sll->sll_ifindex = po->ifindex;
1173         sll->sll_protocol = po->num;
1174         dev = dev_get_by_index(&init_net, po->ifindex);
1175         if (dev) {
1176                 sll->sll_hatype = dev->type;
1177                 sll->sll_halen = dev->addr_len;
1178                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1179                 dev_put(dev);
1180         } else {
1181                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1182                 sll->sll_halen = 0;
1183         }
1184         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1185
1186         return 0;
1187 }
1188
/*
 * Apply one membership entry to a device.  "what" is +1 to add the
 * membership / bump the device counter, -1 to remove / drop it.
 */
static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (what > 0)
			dev_mc_add(dev, i->addr, i->alen, 0);
		else
			dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		/* dev_set_promiscuity keeps a count, so +1/-1 nest properly. */
		dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		dev_set_allmulti(dev, what);
		break;
	default:;
	}
}
1207
1208 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1209 {
1210         for ( ; i; i=i->next) {
1211                 if (i->ifindex == dev->ifindex)
1212                         packet_dev_mc(dev, i, what);
1213         }
1214 }
1215
/*
 * PACKET_ADD_MEMBERSHIP: add (or reference-bump) a multicast/promisc/
 * allmulti membership for this socket.  Runs under the RTNL so device
 * state cannot change underneath us.
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	/* __dev_get_by_index: no refcount needed while holding RTNL. */
	dev = __dev_get_by_index(&init_net, mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	/* Allocate before the duplicate scan so the success path below
	 * cannot fail. */
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	/* Existing identical entry: just bump its refcount. */
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	/* New entry: link it at the head and apply it to the device. */
	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}
1265
/*
 * PACKET_DROP_MEMBERSHIP: drop one reference on a matching membership
 * entry; the entry is unlinked and undone on the device when the count
 * hits zero.  Returns -EADDRNOTAVAIL if no entry matches.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	/* mlp tracks the link pointing at ml so unlinking is O(1). */
	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				/* The device may already be gone; only undo
				 * the membership if it still exists. */
				dev = dev_get_by_index(&init_net, ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
1294
1295 static void packet_flush_mclist(struct sock *sk)
1296 {
1297         struct packet_sock *po = pkt_sk(sk);
1298         struct packet_mclist *ml;
1299
1300         if (!po->mclist)
1301                 return;
1302
1303         rtnl_lock();
1304         while ((ml = po->mclist) != NULL) {
1305                 struct net_device *dev;
1306
1307                 po->mclist = ml->next;
1308                 if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) {
1309                         packet_dev_mc(dev, ml, -1);
1310                         dev_put(dev);
1311                 }
1312                 kfree(ml);
1313         }
1314         rtnl_unlock();
1315 }
1316
/* setsockopt() handler for SOL_PACKET options. */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		/* Zero-fill so a short (old-style packet_mreq) copy leaves
		 * the tail of the larger structure in a defined state. */
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq,optval,len))
			return -EFAULT;
		/* The claimed address length must fit inside what the user
		 * actually supplied. */
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		/* closing == 0: this is a user-requested ring (re)config. */
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		/* Normalize to 0/1. */
		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
1401
/* getsockopt() handler for SOL_PACKET options. */
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch(optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		/* Snapshot and reset the counters atomically w.r.t. the
		 * receive path, which updates them under this lock. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		/* tp_packets reports everything seen, including drops. */
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
1457
1458
/*
 * Netdevice event handler: keep every packet socket's protocol hook and
 * membership state consistent with device up/down/unregister events.
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;

	if (dev->nd_net != &init_net)
		return NOTIFY_DONE;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			/* Undo any memberships this socket held on the
			 * disappearing device. */
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* Detach the hook and drop the
					 * reference it held on the sock. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					/* Device is gone for good: forget
					 * the binding entirely. */
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			/* Re-attach sockets that were bound to this device
			 * and got detached by a previous DOWN. */
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
1511
1512
/* ioctl() handler; unhandled INET ioctls are forwarded to inet_dgram_ops. */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			/* Bytes queued for transmit but not yet freed. */
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			struct sk_buff *skb;
			int amount = 0;

			/* Report the size of the next packet, not the whole
			 * queue (datagram semantics). */
			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
		case SIOCGSTAMPNS:
			return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	/* Not reached: every switch arm returns. */
	return 0;
}
1564
1565 #ifndef CONFIG_PACKET_MMAP
1566 #define packet_mmap sock_no_mmap
1567 #define packet_poll datagram_poll
1568 #else
1569
/*
 * poll() for mmap-ring sockets: in addition to the normal datagram
 * readiness, report POLLIN when the next-to-be-read ring frame has been
 * handed to user space (tp_status != TP_STATUS_KERNEL).
 */
static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		/* po->head is the next frame the kernel will fill, so the
		 * most recently filled frame is the one before it (with
		 * wrap-around to frame_max). */
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
1590
1591
1592 /* Dirty? Well, I still did not learn better way to account
1593  * for user mmaps.
1594  */
1595
1596 static void packet_mm_open(struct vm_area_struct *vma)
1597 {
1598         struct file *file = vma->vm_file;
1599         struct socket * sock = file->private_data;
1600         struct sock *sk = sock->sk;
1601
1602         if (sk)
1603                 atomic_inc(&pkt_sk(sk)->mapped);
1604 }
1605
1606 static void packet_mm_close(struct vm_area_struct *vma)
1607 {
1608         struct file *file = vma->vm_file;
1609         struct socket * sock = file->private_data;
1610         struct sock *sk = sock->sk;
1611
1612         if (sk)
1613                 atomic_dec(&pkt_sk(sk)->mapped);
1614 }
1615
1616 static struct vm_operations_struct packet_mmap_ops = {
1617         .open = packet_mm_open,
1618         .close =packet_mm_close,
1619 };
1620
1621 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1622 {
1623         int i;
1624
1625         for (i = 0; i < len; i++) {
1626                 if (likely(pg_vec[i]))
1627                         free_pages((unsigned long) pg_vec[i], order);
1628         }
1629         kfree(pg_vec);
1630 }
1631
/*
 * Allocate one zeroed, compound page block for the ring.  __GFP_COMP is
 * required so the individual pages can be mapped to user space safely.
 */
static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}
1637
1638 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1639 {
1640         unsigned int block_nr = req->tp_block_nr;
1641         char **pg_vec;
1642         int i;
1643
1644         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1645         if (unlikely(!pg_vec))
1646                 goto out;
1647
1648         for (i = 0; i < block_nr; i++) {
1649                 pg_vec[i] = alloc_one_pg_vec_page(order);
1650                 if (unlikely(!pg_vec[i]))
1651                         goto out_free_pgvec;
1652         }
1653
1654 out:
1655         return pg_vec;
1656
1657 out_free_pgvec:
1658         free_pg_vec(pg_vec, order, block_nr);
1659         pg_vec = NULL;
1660         goto out;
1661 }
1662
/*
 * Install (tp_block_nr != 0) or tear down (tp_block_nr == 0) the mmap
 * receive ring.  "closing" is set on the socket-release path, where the
 * ring must go away even while user mappings still exist.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		/* Blocks must be positive, page-aligned multiples; frames
		 * must hold at least a header and be TPACKET_ALIGNed. */
		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		/* The user's frame count must match the block geometry. */
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		/* Mark every frame as owned by the kernel (writable). */
		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		/* Teardown request: no frames may be specified. */
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	/* Wait for in-flight receive handlers to finish before swapping
	 * the ring out from under them. */
	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
/* Exchange macro: store b into a and return a's old value. */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		/* Swap in the new ring state under the receive-queue lock
		 * so tpacket_rcv never sees a half-updated ring. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		/* Receive path depends on whether a ring is installed. */
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	/* Re-attach the protocol hook if we detached it above. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	/* After the XC swaps, pg_vec holds the OLD ring (or the new one
	 * on the -EBUSY path); either way it must be freed. */
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
1774
/*
 * mmap() the receive ring into user space.  The mapping must cover the
 * whole ring exactly, starting at offset 0.
 */
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	/* The requested mapping must match the ring size exactly. */
	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
		goto out;

	/* Insert every page of every block contiguously into the VMA. */
	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	/* Count this mapping; ops' open/close keep the count across forks. */
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
1815 #endif
1816
1817
/* proto_ops for the obsolete SOCK_PACKET interface: name-based bind,
 * no setsockopt/mmap support. */
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
1838
/* proto_ops for modern SOCK_RAW/SOCK_DGRAM packet sockets, including
 * membership setsockopts and (optionally) the mmap ring. */
static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
1859
/* PF_PACKET family descriptor; packet_create() builds new sockets. */
static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
1865
1866 static struct notifier_block packet_netdev_notifier = {
1867         .notifier_call =packet_notifier,
1868 };
1869
1870 #ifdef CONFIG_PROC_FS
/*
 * Return the socket at position 'off' in the global packet socket
 * list, or NULL when 'off' runs past the end.  Callers hold
 * packet_sklist_lock (taken in packet_seq_start()).
 */
static inline struct sock *packet_seq_idx(loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &packet_sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}
1882
1883 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1884 {
1885         read_lock(&packet_sklist_lock);
1886         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1887 }
1888
1889 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1890 {
1891         ++*pos;
1892         return  (v == SEQ_START_TOKEN)
1893                 ? sk_head(&packet_sklist)
1894                 : sk_next((struct sock*)v) ;
1895 }
1896
/* seq_file stop: drop the lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}
1901
1902 static int packet_seq_show(struct seq_file *seq, void *v)
1903 {
1904         if (v == SEQ_START_TOKEN)
1905                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1906         else {
1907                 struct sock *s = v;
1908                 const struct packet_sock *po = pkt_sk(s);
1909
1910                 seq_printf(seq,
1911                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1912                            s,
1913                            atomic_read(&s->sk_refcnt),
1914                            s->sk_type,
1915                            ntohs(po->num),
1916                            po->ifindex,
1917                            po->running,
1918                            atomic_read(&s->sk_rmem_alloc),
1919                            sock_i_uid(s),
1920                            sock_i_ino(s) );
1921         }
1922
1923         return 0;
1924 }
1925
/* Iterator callbacks for the /proc/net/packet seq file. */
static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
1932
/* open() hook for /proc/net/packet: attach the seq iterator. */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}
1937
/* File operations for /proc/net/packet, backed by the seq_file API. */
static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1945
1946 #endif
1947
/*
 * Module teardown: undo packet_init() registrations in reverse order
 * (proc entry, netdevice notifier, socket family, then the protocol).
 */
static void __exit packet_exit(void)
{
	proc_net_remove(&init_net, "packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
1955
1956 static int __init packet_init(void)
1957 {
1958         int rc = proto_register(&packet_proto, 0);
1959
1960         if (rc != 0)
1961                 goto out;
1962
1963         sock_register(&packet_family_ops);
1964         register_netdevice_notifier(&packet_netdev_notifier);
1965         proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops);
1966 out:
1967         return rc;
1968 }
1969
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
/* Auto-load this module when the first PF_PACKET socket is created. */
MODULE_ALIAS_NETPROTO(PF_PACKET);