/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header.  PPP does this, which is wrong because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
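
/*
 * Illustrative userspace sketch (not part of this file): the layout
 * above is why SOCK_RAW delivers frames starting at the link-layer
 * header while SOCK_DGRAM delivers them starting at the network
 * header, with the ll header summarised in the struct sockaddr_ll
 * that recvfrom()/recvmsg() place in msg_name.
 *
 *      #include <sys/socket.h>
 *      #include <linux/if_packet.h>
 *      #include <linux/if_ether.h>
 *
 *      int raw = socket(PF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *      int dgm = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */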

/* Private packet socket structures. */

struct packet_mclist
{
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max
{
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};
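
/*
 * Illustrative userspace sketch (not part of this file): membership
 * requests normally arrive through the shorter struct packet_mreq
 * from <linux/if_packet.h>; packet_setsockopt() below copies at most
 * sizeof(struct packet_mreq_max) and validates mr_alen against the
 * length actually passed in, so both layouts are accepted.  The
 * interface name and group address are placeholders.
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = if_nametoindex("eth0"),
 *              .mr_type    = PACKET_MR_MULTICAST,
 *              .mr_alen    = ETH_ALEN,
 *              .mr_address = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 },
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 */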

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        char *                  *pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;
#endif
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
        unsigned int pg_vec_pos, frame_offset;

        pg_vec_pos = position / po->frames_per_block;
        frame_offset = position % po->frames_per_block;

        return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
}
#endif
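
/*
 * Worked example for packet_lookup_frame(): with tp_block_size 8192
 * and tp_frame_size 2048, frames_per_block is 4, so frame number 10
 * lives in block 10 / 4 = 2, at byte offset (10 % 4) * 2048 = 4096
 * within that block.
 */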

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
        BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                printk("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (dev_net(dev) != sock_net(sk))
                goto out;

        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                goto oom;

        /* drop any routing info */
        dst_release(skb->dst);
        skb->dst = NULL;

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk,skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto=0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr)
        {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return(-EINVAL);
                if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
                        proto=saddr->spkt_protocol;
        }
        else
                return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         *      You may not queue a frame bigger than the mtu. This is the lowest level
         *      raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         *      If the write buffer is full, then tough. At this level the user gets to
         *      deal with the problem - do your own algorithmic backoffs. That's far
         *      more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        /* Try to align data part correctly */
        if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        /*
         *      Now send it
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
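
/*
 * Illustrative userspace sketch (not part of this file): sending a
 * complete frame through the legacy SOCK_PACKET path above.  The
 * buffer must already contain the link-layer header; the device name
 * and frame contents are placeholders and error handling is omitted.
 *
 *      struct sockaddr_pkt spkt;
 *
 *      memset(&spkt, 0, sizeof(spkt));
 *      spkt.spkt_family = AF_PACKET;
 *      strncpy((char *)spkt.spkt_device, "eth0",
 *              sizeof(spkt.spkt_device));
 *      spkt.spkt_protocol = htons(ETH_P_IP);
 *      sendto(fd, frame, frame_len, 0,
 *             (struct sockaddr *)&spkt, sizeof(spkt));
 */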

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}

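/*
 * Illustrative userspace sketch (not part of this file): the filter
 * consulted by run_filter() above is installed with SO_ATTACH_FILTER.
 * This one-instruction classic BPF program accepts every packet but
 * truncates it to 96 bytes; real programs are usually generated, e.g.
 * by tcpdump -dd.
 *
 *      #include <linux/filter.h>
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 96 },  // accept, snap to 96 bytes
 *      };
 *      struct sock_fprog prog = {
 *              .len    = sizeof(code) / sizeof(code[0]),
 *              .filter = code,
 *      };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */
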
/*
   This function performs lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so if we return the skb to its original state on exit,
   we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides the details of its frame
                   structure, so the corresponding packet header is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        dst_release(skb->dst);
        skb->dst = NULL;

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        struct tpacket_hdr *h;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h = packet_lookup_frame(po, po->head);

        if (h->tp_status)
                goto ring_is_full;
        po->head = po->head != po->frame_max ? po->head+1 : 0;
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

        h->tp_len = skb->len;
        h->tp_snaplen = snaplen;
        h->tp_mac = macoff;
        h->tp_net = netoff;
        if (skb->tstamp.tv64)
                tv = ktime_to_timeval(skb->tstamp);
        else
                do_gettimeofday(&tv);
        h->tp_sec = tv.tv_sec;
        h->tp_usec = tv.tv_usec;

        sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        h->tp_status = status;
        smp_mb();

        {
                struct page *p_start, *p_end;
                u8 *h_end = (u8 *)h + macoff + snaplen - 1;

                p_start = virt_to_page(h);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        if (copy_skb)
                kfree_skb(copy_skb);
        goto drop_n_restore;
}

#endif
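
/*
 * Illustrative userspace sketch (not part of this file): consuming
 * the ring that tpacket_rcv() fills.  "ring" is the mmap()ed area and
 * "req" the struct tpacket_req used to size it; the indexing mirrors
 * packet_lookup_frame() above.  poll() setup and error handling are
 * omitted.
 *
 *      unsigned int i = 0, fpb = req.tp_block_size / req.tp_frame_size;
 *
 *      for (;;) {
 *              struct tpacket_hdr *h = (struct tpacket_hdr *)
 *                      (ring + (i / fpb) * req.tp_block_size
 *                            + (i % fpb) * req.tp_frame_size);
 *
 *              if (!(h->tp_status & TP_STATUS_USER)) {
 *                      poll(&pfd, 1, -1);      // wait for sk_data_ready()
 *                      continue;
 *              }
 *              process((char *)h + h->tp_mac, h->tp_snaplen);
 *              h->tp_status = TP_STATUS_KERNEL;        // hand back to kernel
 *              i = (i + 1) % req.tp_frame_nr;
 *      }
 */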


static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb==NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return(len);

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        if (po->pg_vec) {
                struct tpacket_req req;
                memset(&req, 0, sizeof(req));
                packet_set_ring(sk, &req, 1);
        }
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk=sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name,uaddr->sa_data,sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
        struct sock *sk=sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}
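
/*
 * Illustrative userspace sketch (not part of this file): binding a
 * socket to one interface and protocol through packet_bind() above.
 * The interface name is a placeholder; error handling is omitted.
 *
 *      struct sockaddr_ll sll;
 *
 *      memset(&sll, 0, sizeof(sll));
 *      sll.sll_family   = AF_PACKET;
 *      sll.sll_ifindex  = if_nametoindex("eth0");
 *      sll.sll_protocol = htons(ETH_P_IP);
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */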

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        write_unlock_bh(&net->packet.sklist_lock);
        return(0);
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      It will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred, so return it. skb_recv_datagram()
         *      handles any blocking for us, so we don't have to worry
         *      about blocking retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len)
        {
                copied=len;
                msg->msg_flags|=MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}
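
/*
 * Illustrative userspace sketch (not part of this file): retrieving
 * the PACKET_AUXDATA control message built above.  It must first be
 * enabled with setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one,
 * sizeof(one)); msghdr/iovec setup is omitted.
 *
 *      struct cmsghdr *cmsg;
 *
 *      recvmsg(fd, &msg, 0);
 *      for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *           cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *              if (cmsg->cmsg_level == SOL_PACKET &&
 *                  cmsg->cmsg_type == PACKET_AUXDATA) {
 *                      struct tpacket_auxdata *aux =
 *                              (struct tpacket_auxdata *)CMSG_DATA(cmsg);
 *                      // aux->tp_len: original length,
 *                      // aux->tp_snaplen: what was delivered
 *              }
 *      }
 */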

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
                dev_set_allmulti(dev, what);
                break;
        default:;
        }
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i=i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        packet_dev_mc(dev, i, +1);

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev) {
                                        packet_dev_mc(dev, ml, -1);
                                        dev_put(dev);
                                }
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
                        packet_dev_mc(dev, ml, -1);
                        dev_put(dev);
                }
                kfree(ml);
        }
        rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch(optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_user(&mreq,optval,len))
                        return -EFAULT;
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

#ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        {
                struct tpacket_req req;

                if (optlen<sizeof(req))
                        return -EINVAL;
                if (copy_from_user(&req,optval,sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen!=sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val,optval,sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
#endif
        case PACKET_AUXDATA:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->auxdata = !!val;
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->origdev = !!val;
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}
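
/*
 * Illustrative userspace sketch (not part of this file): creating and
 * mapping the receive ring handled by PACKET_RX_RING above.  The
 * sizes are placeholders; they must satisfy the checks in
 * packet_set_ring(): block size a multiple of the page size, frame
 * size TPACKET_ALIGNMENT-aligned, and
 * tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr.
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 8192,
 *              .tp_block_nr   = 64,
 *              .tp_frame_size = 2048,
 *              .tp_frame_nr   = (8192 / 2048) * 64,
 *      };
 *      char *ring;
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */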

static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        int val;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data;
        struct tpacket_stats st;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch(optname) {
        case PACKET_STATISTICS:
                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                data = &st;
                break;
        case PACKET_AUXDATA:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->auxdata;

                data = &val;
                break;
        case PACKET_ORIGDEV:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->origdev;

                data = &val;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}
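
/*
 * Illustrative userspace sketch (not part of this file): reading the
 * counters returned by PACKET_STATISTICS above.  Note that the kernel
 * zeroes its copy on every read and, as coded above, folds tp_drops
 * into tp_packets before returning.
 *
 *      struct tpacket_stats st;
 *      socklen_t len = sizeof(st);
 *
 *      getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *      printf("%u packets, %u drops\n", st.tp_packets, st.tp_drops);
 */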


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
        struct sock *sk;
        struct hlist_node *node;
        struct net_device *dev = data;
        struct net *net = dev_net(dev);

        read_lock(&net->packet.sklist_lock);
        sk_for_each(sk, node, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
                        /* fallthrough */

                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->running) {
                                        __dev_remove_pack(&po->prot_hook);
                                        __sock_put(sk);
                                        po->running = 0;
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        po->ifindex = -1;
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        spin_lock(&po->bind_lock);
                        if (dev->ifindex == po->ifindex && po->num &&
                            !po->running) {
                                dev_add_pack(&po->prot_hook);
                                sock_hold(sk);
                                po->running = 1;
                        }
                        spin_unlock(&po->bind_lock);
                        break;
                }
        }
        read_unlock(&net->packet.sklist_lock);
        return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch(cmd) {
                case SIOCOUTQ:
                {
                        int amount = atomic_read(&sk->sk_wmem_alloc);
                        return put_user(amount, (int __user *)arg);
                }
                case SIOCINQ:
                {
                        struct sk_buff *skb;
                        int amount = 0;

                        spin_lock_bh(&sk->sk_receive_queue.lock);
                        skb = skb_peek(&sk->sk_receive_queue);
                        if (skb)
                                amount = skb->len;
                        spin_unlock_bh(&sk->sk_receive_queue.lock);
                        return put_user(amount, (int __user *)arg);
                }
                case SIOCGSTAMP:
                        return sock_get_timestamp(sk, (struct timeval __user *)arg);
                case SIOCGSTAMPNS:
                        return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
                case SIOCADDRT:
                case SIOCDELRT:
                case SIOCDARP:
                case SIOCGARP:
                case SIOCSARP:
                case SIOCGIFADDR:
                case SIOCSIFADDR:
                case SIOCGIFBRDADDR:
                case SIOCSIFBRDADDR:
                case SIOCGIFNETMASK:
                case SIOCSIFNETMASK:
                case SIOCGIFDSTADDR:
                case SIOCSIFDSTADDR:
                case SIOCSIFFLAGS:
                        if (sock_net(sk) != &init_net)
                                return -ENOIOCTLCMD;
                        return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

                default:
                        return -ENOIOCTLCMD;
        }
        return 0;
}

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

static unsigned int packet_poll(struct file * file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned int mask = datagram_poll(file, sock, wait);

        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->pg_vec) {
                unsigned last = po->head ? po->head-1 : po->frame_max;
                struct tpacket_hdr *h;

                h = packet_lookup_frame(po, last);

                if (h->tp_status)
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        return mask;
}


/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */
1582
1583 static void packet_mm_open(struct vm_area_struct *vma)
1584 {
1585         struct file *file = vma->vm_file;
1586         struct socket * sock = file->private_data;
1587         struct sock *sk = sock->sk;
1588
1589         if (sk)
1590                 atomic_inc(&pkt_sk(sk)->mapped);
1591 }
1592
1593 static void packet_mm_close(struct vm_area_struct *vma)
1594 {
1595         struct file *file = vma->vm_file;
1596         struct socket * sock = file->private_data;
1597         struct sock *sk = sock->sk;
1598
1599         if (sk)
1600                 atomic_dec(&pkt_sk(sk)->mapped);
1601 }
1602
1603 static struct vm_operations_struct packet_mmap_ops = {
1604         .open = packet_mm_open,
1605         .close =packet_mm_close,
1606 };
1607
1608 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1609 {
1610         int i;
1611
1612         for (i = 0; i < len; i++) {
1613                 if (likely(pg_vec[i]))
1614                         free_pages((unsigned long) pg_vec[i], order);
1615         }
1616         kfree(pg_vec);
1617 }
1618
1619 static inline char *alloc_one_pg_vec_page(unsigned long order)
1620 {
1621         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1622                                          order);
1623 }
1624
1625 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1626 {
1627         unsigned int block_nr = req->tp_block_nr;
1628         char **pg_vec;
1629         int i;
1630
1631         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1632         if (unlikely(!pg_vec))
1633                 goto out;
1634
1635         for (i = 0; i < block_nr; i++) {
1636                 pg_vec[i] = alloc_one_pg_vec_page(order);
1637                 if (unlikely(!pg_vec[i]))
1638                         goto out_free_pgvec;
1639         }
1640
1641 out:
1642         return pg_vec;
1643
1644 out_free_pgvec:
1645         free_pg_vec(pg_vec, order, block_nr);
1646         pg_vec = NULL;
1647         goto out;
1648 }
1649
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                           int closing)
{
        char **pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        __be16 num;
        int err = 0;

        if (req->tp_block_nr) {
                int i;

                /* Sanity tests and some calculations */

                if (unlikely(po->pg_vec))
                        return -EBUSY;

                if (unlikely((int)req->tp_block_size <= 0))
                        return -EINVAL;
                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
                        return -EINVAL;
                if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
                        return -EINVAL;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        return -EINVAL;

                po->frames_per_block = req->tp_block_size / req->tp_frame_size;
                if (unlikely(po->frames_per_block <= 0))
                        return -EINVAL;
                if (unlikely((po->frames_per_block * req->tp_block_nr) !=
                             req->tp_frame_nr))
                        return -EINVAL;
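
                /*
                 * Example: tp_block_size = 8192 with tp_frame_size = 2048
                 * gives frames_per_block = 4, so with tp_block_nr = 16 the
                 * request is only valid if tp_frame_nr = 4 * 16 = 64.
                 */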

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;

                for (i = 0; i < req->tp_block_nr; i++) {
                        char *ptr = pg_vec[i];
                        struct tpacket_hdr *header;
                        int k;

                        for (k = 0; k < po->frames_per_block; k++) {
                                header = (struct tpacket_hdr *) ptr;
                                header->tp_status = TP_STATUS_KERNEL;
                                ptr += req->tp_frame_size;
                        }
                }
                /* Done */
        } else {
                if (unlikely(req->tp_frame_nr))
                        return -EINVAL;
        }

        lock_sock(sk);

        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = po->running;
        num = po->num;
        if (was_running) {
                __dev_remove_pack(&po->prot_hook);
                po->num = 0;
                po->running = 0;
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        synchronize_net();

        err = -EBUSY;
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
                /* XC(a, b): store b in a, returning the old value of a */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

                spin_lock_bh(&sk->sk_receive_queue.lock);
                pg_vec = XC(po->pg_vec, pg_vec);
                po->frame_max = (req->tp_frame_nr - 1);
                po->head = 0;
                po->frame_size = req->tp_frame_size;
                spin_unlock_bh(&sk->sk_receive_queue.lock);

                order = XC(po->pg_vec_order, order);
                req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

                po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
                po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
                skb_queue_purge(&sk->sk_receive_queue);
#undef XC
                if (atomic_read(&po->mapped))
                        printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
        }

        spin_lock(&po->bind_lock);
        if (was_running && !po->running) {
                sock_hold(sk);
                po->running = 1;
                po->num = num;
                dev_add_pack(&po->prot_hook);
        }
        spin_unlock(&po->bind_lock);

        release_sock(sk);

        if (pg_vec)
                free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
        return err;
}

static int packet_mmap(struct file *file, struct socket *sock,
                       struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long size;
        unsigned long start;
        int err = -EINVAL;
        int i;

        if (vma->vm_pgoff)
                return -EINVAL;

        size = vma->vm_end - vma->vm_start;

        lock_sock(sk);
        if (po->pg_vec == NULL)
                goto out;
        if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
                goto out;

        start = vma->vm_start;
        for (i = 0; i < po->pg_vec_len; i++) {
                struct page *page = virt_to_page(po->pg_vec[i]);
                int pg_num;

                for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
                        err = vm_insert_page(vma, start, page);
                        if (unlikely(err))
                                goto out;
                        start += PAGE_SIZE;
                }
        }
        atomic_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        err = 0;

out:
        release_sock(sk);
        return err;
}
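
/*
 * For reference, a minimal sketch of how user space is expected to drive
 * this ring (TPACKET_V1-style API; the sizes are only an example and all
 * error handling is omitted):
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 8192,
 *              .tp_block_nr   = 16,
 *              .tp_frame_size = 2048,
 *              .tp_frame_nr   = 64,
 *      };
 *      int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Every frame begins with a struct tpacket_hdr; user space polls the
 * socket, consumes frames whose tp_status is non-zero, and stores
 * TP_STATUS_KERNEL back to return each frame to the kernel.
 */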
#endif

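/*
 * proto_ops for the legacy SOCK_PACKET interface: no mmap'ed ring and no
 * packet-specific socket options, hence the sock_no_* and datagram_poll
 * defaults.
 */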
static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind_spkt,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname_spkt,
        .poll =         datagram_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      packet_sendmsg_spkt,
        .recvmsg =      packet_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname,
        .poll =         packet_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   packet_setsockopt,
        .getsockopt =   packet_getsockopt,
        .sendmsg =      packet_sendmsg,
        .recvmsg =      packet_recvmsg,
        .mmap =         packet_mmap,
        .sendpage =     sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
        .family =       PF_PACKET,
        .create =       packet_create,
        .owner =        THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
        .notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS
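/* /proc/net/packet: one line of state per packet socket in the namespace. */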
static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
{
        struct sock *s;
        struct hlist_node *node;

        sk_for_each(s, node, &net->packet.sklist) {
                if (!off--)
                        return s;
        }
        return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(seq_file_net(seq)->packet.sklist_lock)
{
        struct net *net = seq_file_net(seq);
        read_lock(&net->packet.sklist_lock);
        return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct net *net = seq_file_net(seq);
        ++*pos;
        return (v == SEQ_START_TOKEN)
                ? sk_head(&net->packet.sklist)
                : sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(seq_file_net(seq)->packet.sklist_lock)
{
        struct net *net = seq_file_net(seq);
        read_unlock(&net->packet.sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
        else {
                struct sock *s = v;
                const struct packet_sock *po = pkt_sk(s);

                seq_printf(seq,
                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                           s,
                           atomic_read(&s->sk_refcnt),
                           s->sk_type,
                           ntohs(po->num),
                           po->ifindex,
                           po->running,
                           atomic_read(&s->sk_rmem_alloc),
                           sock_i_uid(s),
                           sock_i_ino(s));
        }

        return 0;
}

static const struct seq_operations packet_seq_ops = {
        .start  = packet_seq_start,
        .next   = packet_seq_next,
        .stop   = packet_seq_stop,
        .show   = packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &packet_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = packet_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

#endif

static int packet_net_init(struct net *net)
{
        rwlock_init(&net->packet.sklist_lock);
        INIT_HLIST_HEAD(&net->packet.sklist);

        if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
                return -ENOMEM;

        return 0;
}

static void packet_net_exit(struct net *net)
{
        proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
        /* Tear everything down in the reverse order of packet_init() */
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
        int rc = proto_register(&packet_proto, 0);

        if (rc != 0)
                goto out;

        sock_register(&packet_family_ops);
        register_pernet_subsys(&packet_net_ops);
        register_netdevice_notifier(&packet_netdev_notifier);
out:
        return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);