Merge branch 'linus' into x86/timers
[linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are enough clever to reallocate skb, when header
93      will not fit to reserved space (tunnel), another ones are silly
94      (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely, that it points to ll
111                  header.  PPP does this, which is wrong because it introduces
112                  an asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position,
135    packet classifier depends on it.
136  */
137
138 /* Private packet socket structures. */
139
/*
 * One node of a socket's membership list: nodes are chained through ->next
 * and the list head is packet_sock.mclist.
 * NOTE(review): ifindex/type/alen/addr parallel the mr_* fields of
 * struct packet_mreq_max, and count looks like a per-entry use count; the
 * code that maintains this list is outside this view -- confirm there.
 */
140 struct packet_mclist
141 {
142         struct packet_mclist    *next;          /* next node, singly linked */
143         int                     ifindex;
144         int                     count;
145         unsigned short          type;
146         unsigned short          alen;           /* presumably the number of valid bytes in addr -- confirm */
147         unsigned char           addr[MAX_ADDR_LEN];
148 };
149 /* Identical to struct packet_mreq except that mr_address is
150  * MAX_ADDR_LEN bytes long, leaving room for long hardware addresses.
151  */
152 struct packet_mreq_max
153 {
154         int             mr_ifindex;
155         unsigned short  mr_type;
156         unsigned short  mr_alen;
157         unsigned char   mr_address[MAX_ADDR_LEN];
158 };
159
160 #ifdef CONFIG_PACKET_MMAP
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
162 #endif
163
164 static void packet_flush_mclist(struct sock *sk);
165
/*
 * Per-socket state of an AF_PACKET socket.  A struct sock pointer is cast
 * directly to this type by pkt_sk(), hence the layout constraint on ->sk.
 */
166 struct packet_sock {
167         /* struct sock has to be the first member of packet_sock */
168         struct sock             sk;
169         struct tpacket_stats    stats;          /* tp_packets / tp_drops, updated under sk_receive_queue.lock */
170 #ifdef CONFIG_PACKET_MMAP
171         char *                  *pg_vec;        /* array of frame blocks, indexed by packet_lookup_frame() */
172         unsigned int            head;           /* index of the next ring frame tpacket_rcv() fills */
173         unsigned int            frames_per_block;
174         unsigned int            frame_size;     /* bytes per ring frame */
175         unsigned int            frame_max;      /* head wraps to 0 after reaching this index */
176         int                     copy_thresh;    /* non-zero: oversized frames may also be queued as an skb */
177 #endif
178         struct packet_type      prot_hook;      /* handler registered with dev_add_pack() */
179         spinlock_t              bind_lock;      /* taken by packet_do_bind() around hook changes */
180         unsigned int            running:1,      /* prot_hook is attached*/
181                                 auxdata:1,
182                                 origdev:1;      /* report orig_dev->ifindex rather than dev->ifindex */
183         int                     ifindex;        /* bound device         */
184         __be16                  num;            /* bound protocol, network byte order; 0 when unhooked */
185         struct packet_mclist    *mclist;        /* flushed by packet_flush_mclist() on release */
186 #ifdef CONFIG_PACKET_MMAP
187         atomic_t                mapped;
188         unsigned int            pg_vec_order;
189         unsigned int            pg_vec_pages;
190         unsigned int            pg_vec_len;
191 #endif
192 };
193
/*
 * Layout this file imposes on skb->cb for queued packets: the packet length
 * before trimming plus the source address handed back to the receiver
 * (sockaddr_pkt for SOCK_PACKET, sockaddr_ll otherwise).
 */
194 struct packet_skb_cb {
195         unsigned int origlen;   /* skb->len recorded by packet_rcv() before pskb_trim() */
196         union {
197                 struct sockaddr_pkt pkt;        /* filled by packet_rcv_spkt() */
198                 struct sockaddr_ll ll;          /* filled by packet_rcv() */
199         } sa;
200 };
201
/* View the control buffer of __skb as a struct packet_skb_cb. */
202 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
203
204 #ifdef CONFIG_PACKET_MMAP
205
206 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
207 {
208         unsigned int pg_vec_pos, frame_offset;
209
210         pg_vec_pos = position / po->frames_per_block;
211         frame_offset = position % po->frames_per_block;
212
213         return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
214 }
215 #endif
216
/*
 * Downcast a generic socket to the packet socket that embeds it.
 * Valid because struct sock is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}
221
222 static void packet_sock_destruct(struct sock *sk)
223 {
224         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
225         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
226
227         if (!sock_flag(sk, SOCK_DEAD)) {
228                 printk("Attempt to release alive packet socket: %p\n", sk);
229                 return;
230         }
231
232         sk_refcnt_debug_dec(sk);
233 }
234
235
236 static const struct proto_ops packet_ops;
237
238 static const struct proto_ops packet_ops_spkt;
239
240 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
241 {
242         struct sock *sk;
243         struct sockaddr_pkt *spkt;
244
245         /*
246          *      When we registered the protocol we saved the socket in the data
247          *      field for just this event.
248          */
249
250         sk = pt->af_packet_priv;
251
252         /*
253          *      Yank back the headers [hope the device set this
254          *      right or kerboom...]
255          *
256          *      Incoming packets have ll header pulled,
257          *      push it back.
258          *
259          *      For outgoing ones skb->data == skb_mac_header(skb)
260          *      so that this procedure is noop.
261          */
262
263         if (skb->pkt_type == PACKET_LOOPBACK)
264                 goto out;
265
266         if (dev_net(dev) != sock_net(sk))
267                 goto out;
268
269         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
270                 goto oom;
271
272         /* drop any routing info */
273         dst_release(skb->dst);
274         skb->dst = NULL;
275
276         /* drop conntrack reference */
277         nf_reset(skb);
278
279         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
280
281         skb_push(skb, skb->data - skb_mac_header(skb));
282
283         /*
284          *      The SOCK_PACKET socket receives _all_ frames.
285          */
286
287         spkt->spkt_family = dev->type;
288         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
289         spkt->spkt_protocol = skb->protocol;
290
291         /*
292          *      Charge the memory to the socket. This is done specifically
293          *      to prevent sockets using all the memory up.
294          */
295
296         if (sock_queue_rcv_skb(sk,skb) == 0)
297                 return 0;
298
299 out:
300         kfree_skb(skb);
301 oom:
302         return 0;
303 }
304
305
306 /*
307  *      Output a raw packet to a device layer. This bypasses all the other
308  *      protocol layers and you must therefore supply it with a complete frame
309  */
310
/*
 * Transmit one caller-built frame on the device named in the destination
 * address (struct sockaddr_pkt).
 *
 * Returns len on success, otherwise a negative errno:
 *   -ENOTCONN  no destination address was supplied
 *   -EINVAL    address shorter than struct sockaddr
 *   -ENODEV    no device with that name in the socket's namespace
 *   -ENETDOWN  device is not IFF_UP
 *   -EMSGSIZE  len exceeds dev->mtu + dev->hard_header_len
 *   -ENOBUFS   sock_wmalloc() could not provide a buffer
 *   or the error from memcpy_fromiovec().
 * The result of dev_queue_xmit() is not examined.  The device reference
 * taken by dev_get_by_name() is dropped on every path.
 */
311 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
312                                struct msghdr *msg, size_t len)
313 {
314         struct sock *sk = sock->sk;
315         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
316         struct sk_buff *skb;
317         struct net_device *dev;
318         __be16 proto=0;
319         int err;
320
321         /*
322          *      Get and verify the address.
323          */
324
325         if (saddr)
326         {
327                 if (msg->msg_namelen < sizeof(struct sockaddr))
328                         return(-EINVAL);
            /* The protocol is only taken from a full-sized sockaddr_pkt; otherwise it stays 0. */
329                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
330                         proto=saddr->spkt_protocol;
331         }
332         else
333                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
334
335         /*
336          *      Find the device first to size check it
337          */
338
            /* Force termination of the device name.
             * NOTE(review): 13 is presumably the last index of spkt_device -- confirm against struct sockaddr_pkt.
             */
339         saddr->spkt_device[13] = 0;
340         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
341         err = -ENODEV;
342         if (dev == NULL)
343                 goto out_unlock;
344
345         err = -ENETDOWN;
346         if (!(dev->flags & IFF_UP))
347                 goto out_unlock;
348
349         /*
350          *      You may not queue a frame bigger than the mtu. This is the lowest level
351          *      raw protocol and you must do your own fragmentation at this level.
352          */
353
354         err = -EMSGSIZE;
355         if (len > dev->mtu + dev->hard_header_len)
356                 goto out_unlock;
357
358         err = -ENOBUFS;
359         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
360
361         /*
362          *      If the write buffer is full, then tough. At this level the user gets to
363          *      deal with the problem - do your own algorithmic backoffs. That's far
364          *      more flexible.
365          */
366
367         if (skb == NULL)
368                 goto out_unlock;
369
370         /*
371          *      Fill it in
372          */
373
374         /* FIXME: Save some space for broken drivers that write a
375          * hard header at transmission time by themselves. PPP is the
376          * notable one here. This should really be fixed at the driver level.
377          */
378         skb_reserve(skb, LL_RESERVED_SPACE(dev));
379         skb_reset_network_header(skb);
380
381         /* Try to align data part correctly */
            /* With header_ops, data/tail are moved back by hard_header_len so the
             * user-supplied ll header lands inside the reserved headroom and the
             * network header mark set above stays just past it.
             */
382         if (dev->header_ops) {
383                 skb->data -= dev->hard_header_len;
384                 skb->tail -= dev->hard_header_len;
            /* Frame shorter than a full ll header: re-mark the network header at the new data start. */
385                 if (len < dev->hard_header_len)
386                         skb_reset_network_header(skb);
387         }
388
389         /* Returns -EFAULT on error */
390         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
391         skb->protocol = proto;
392         skb->dev = dev;
393         skb->priority = sk->sk_priority;
394         if (err)
395                 goto out_free;
396
397         /*
398          *      Now send it
399          */
400
401         dev_queue_xmit(skb);
402         dev_put(dev);
403         return(len);
404
405 out_free:
406         kfree_skb(skb);
407 out_unlock:
408         if (dev)
409                 dev_put(dev);
410         return err;
411 }
412
413 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
414                                       unsigned int res)
415 {
416         struct sk_filter *filter;
417
418         rcu_read_lock_bh();
419         filter = rcu_dereference(sk->sk_filter);
420         if (filter != NULL)
421                 res = sk_run_filter(skb, filter->insns, filter->len);
422         rcu_read_unlock_bh();
423
424         return res;
425 }
426
427 /*
428    This function makes lazy skb cloning in hope that most of packets
429    are discarded by BPF.
430
431    Note tricky part: we DO mangle shared skb! skb->data, skb->len
432    and skb->cb are mangled. It works because (and until) packets
433    falling here are owned by current CPU. Output packets are cloned
434    by dev_queue_xmit_nit(), input packets are processed by net_bh
435    sequentially, so that if we return skb to original state on exit,
436    we will not harm anyone.
437  */
438
/*
 * Receive hook for packet sockets (packet_create() installs it as the
 * default prot_hook.func).
 *
 * Adjusts skb->data for the socket type, runs the socket filter, and if the
 * packet is accepted and fits the receive buffer, records a sockaddr_ll and
 * the untrimmed length in skb->cb, trims to the filter's snap length and
 * queues the packet on sk_receive_queue.  A shared skb is cloned before it
 * is modified further and the caller's skb is restored to its entry state.
 *
 * Always returns 0.  The reference passed in is either queued or released.
 */
439 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
440 {
441         struct sock *sk;
442         struct sockaddr_ll *sll;
443         struct packet_sock *po;
444         u8 * skb_head = skb->data;      /* entry state, restored on a shared skb before it is released */
445         int skb_len = skb->len;
446         unsigned int snaplen, res;
447
448         if (skb->pkt_type == PACKET_LOOPBACK)
449                 goto drop;
450
451         sk = pt->af_packet_priv;
452         po = pkt_sk(sk);
453
454         if (dev_net(dev) != sock_net(sk))
455                 goto drop;
456
457         skb->dev = dev;
458
459         if (dev->header_ops) {
460                 /* The device has an explicit notion of ll header,
461                    exported to higher levels.
462
463                    Otherwise, the device hides details of its frame
464                    structure, so that corresponding packet head
465                    never delivered to user.
466                  */
467                 if (sk->sk_type != SOCK_DGRAM)
468                         skb_push(skb, skb->data - skb_mac_header(skb));
469                 else if (skb->pkt_type == PACKET_OUTGOING) {
470                         /* Special case: outgoing packets have ll header at head */
471                         skb_pull(skb, skb_network_offset(skb));
472                 }
473         }
474
475         snaplen = skb->len;
476
            /* A zero verdict rejects the packet; otherwise the verdict caps the
             * number of bytes delivered.  Without a filter the full length is kept.
             */
477         res = run_filter(skb, sk, snaplen);
478         if (!res)
479                 goto drop_n_restore;
480         if (snaplen > res)
481                 snaplen = res;
482
            /* Receive buffer would be exceeded: count a drop. */
483         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
484             (unsigned)sk->sk_rcvbuf)
485                 goto drop_n_acct;
486
487         if (skb_shared(skb)) {
488                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
489                 if (nskb == NULL)
490                         goto drop_n_acct;
491
            /* Undo the push/pull on the caller's skb before giving up our reference;
             * the clone keeps the adjusted data pointer and length.
             */
492                 if (skb_head != skb->data) {
493                         skb->data = skb_head;
494                         skb->len = skb_len;
495                 }
496                 kfree_skb(skb);
497                 skb = nskb;
498         }
499
            /* Compile-time check that the cb layout plus the longest hardware address fits skb->cb. */
500         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
501                      sizeof(skb->cb));
502
503         sll = &PACKET_SKB_CB(skb)->sa.ll;
504         sll->sll_family = AF_PACKET;
505         sll->sll_hatype = dev->type;
506         sll->sll_protocol = skb->protocol;
507         sll->sll_pkttype = skb->pkt_type;
508         if (unlikely(po->origdev))
509                 sll->sll_ifindex = orig_dev->ifindex;
510         else
511                 sll->sll_ifindex = dev->ifindex;
512
513         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
514
            /* Remember the untrimmed length before cutting down to snaplen. */
515         PACKET_SKB_CB(skb)->origlen = skb->len;
516
517         if (pskb_trim(skb, snaplen))
518                 goto drop_n_acct;
519
520         skb_set_owner_r(skb, sk);
521         skb->dev = NULL;
522         dst_release(skb->dst);
523         skb->dst = NULL;
524
525         /* drop conntrack reference */
526         nf_reset(skb);
527
528         spin_lock(&sk->sk_receive_queue.lock);
529         po->stats.tp_packets++;
530         __skb_queue_tail(&sk->sk_receive_queue, skb);
531         spin_unlock(&sk->sk_receive_queue.lock);
532         sk->sk_data_ready(sk, skb->len);
533         return 0;
534
535 drop_n_acct:
536         spin_lock(&sk->sk_receive_queue.lock);
537         po->stats.tp_drops++;
538         spin_unlock(&sk->sk_receive_queue.lock);
539
540 drop_n_restore:
            /* Only a shared skb has other users who need the entry state back. */
541         if (skb_head != skb->data && skb_shared(skb)) {
542                 skb->data = skb_head;
543                 skb->len = skb_len;
544         }
545 drop:
546         kfree_skb(skb);
547         return 0;
548 }
549
550 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook that delivers packets through the frame ring (po->pg_vec)
 * instead of the socket receive queue.
 *
 * After the same header adjustment and filtering as packet_rcv(), the
 * packet (truncated to fit a frame if necessary) is copied into the frame
 * at po->head together with a tpacket_hdr and a sockaddr_ll, and the
 * frame's tp_status is set to hand it over.  If the frame at po->head is
 * still in use the packet is counted as dropped.  When the packet does not
 * fit a frame and po->copy_thresh is set, the full packet is additionally
 * queued on sk_receive_queue and the frame is flagged TP_STATUS_COPY.
 *
 * Always returns 0 and releases the reference passed in.
 */
551 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
552 {
553         struct sock *sk;
554         struct packet_sock *po;
555         struct sockaddr_ll *sll;
556         struct tpacket_hdr *h;
557         u8 * skb_head = skb->data;      /* entry state, restored on a shared skb at exit */
558         int skb_len = skb->len;
559         unsigned int snaplen, res;
560         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; /* LOSING is cleared below when no drops were counted */
561         unsigned short macoff, netoff;
562         struct sk_buff *copy_skb = NULL;
563         struct timeval tv;
564
565         if (skb->pkt_type == PACKET_LOOPBACK)
566                 goto drop;
567
568         sk = pt->af_packet_priv;
569         po = pkt_sk(sk);
570
571         if (dev_net(dev) != sock_net(sk))
572                 goto drop;
573
574         if (dev->header_ops) {
575                 if (sk->sk_type != SOCK_DGRAM)
576                         skb_push(skb, skb->data - skb_mac_header(skb));
577                 else if (skb->pkt_type == PACKET_OUTGOING) {
578                         /* Special case: outgoing packets have ll header at head */
579                         skb_pull(skb, skb_network_offset(skb));
580                 }
581         }
582
583         if (skb->ip_summed == CHECKSUM_PARTIAL)
584                 status |= TP_STATUS_CSUMNOTREADY;
585
586         snaplen = skb->len;
587
588         res = run_filter(skb, sk, snaplen);
589         if (!res)
590                 goto drop_n_restore;
591         if (snaplen > res)
592                 snaplen = res;
593
            /* Work out where inside the frame the ll header (macoff) and the
             * network header (netoff) go.  SOCK_DGRAM delivers no ll header,
             * so both offsets coincide.
             */
594         if (sk->sk_type == SOCK_DGRAM) {
595                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
596         } else {
597                 unsigned maclen = skb_network_offset(skb);
598                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
599                 macoff = netoff - maclen;
600         }
601
            /* Packet does not fit one frame: optionally keep a full copy for the
             * receive queue, then truncate what goes into the ring.
             */
602         if (macoff + snaplen > po->frame_size) {
603                 if (po->copy_thresh &&
604                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
605                     (unsigned)sk->sk_rcvbuf) {
606                         if (skb_shared(skb)) {
607                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
608                         } else {
            /* The same skb is queued; resetting skb_head makes the restore
             * test at drop_n_restore false, so its adjusted data pointer is kept.
             */
609                                 copy_skb = skb_get(skb);
610                                 skb_head = skb->data;
611                         }
612                         if (copy_skb)
613                                 skb_set_owner_r(copy_skb, sk);
614                 }
615                 snaplen = po->frame_size - macoff;
616                 if ((int)snaplen < 0)
617                         snaplen = 0;
618         }
619
620         spin_lock(&sk->sk_receive_queue.lock);
621         h = packet_lookup_frame(po, po->head);
622
            /* Non-zero status: the frame has not been handed back yet. */
623         if (h->tp_status)
624                 goto ring_is_full;
625         po->head = po->head != po->frame_max ? po->head+1 : 0;  /* advance, wrapping after frame_max */
626         po->stats.tp_packets++;
627         if (copy_skb) {
628                 status |= TP_STATUS_COPY;
629                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
630         }
631         if (!po->stats.tp_drops)
632                 status &= ~TP_STATUS_LOSING;
633         spin_unlock(&sk->sk_receive_queue.lock);
634
            /* The frame is filled outside the queue lock; head has already moved past it. */
635         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
636
637         h->tp_len = skb->len;
638         h->tp_snaplen = snaplen;
639         h->tp_mac = macoff;
640         h->tp_net = netoff;
641         if (skb->tstamp.tv64)
642                 tv = ktime_to_timeval(skb->tstamp);
643         else
644                 do_gettimeofday(&tv);
645         h->tp_sec = tv.tv_sec;
646         h->tp_usec = tv.tv_usec;
647
            /* The sockaddr_ll sits directly after the aligned tpacket_hdr. */
648         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
649         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
650         sll->sll_family = AF_PACKET;
651         sll->sll_hatype = dev->type;
652         sll->sll_protocol = skb->protocol;
653         sll->sll_pkttype = skb->pkt_type;
654         if (unlikely(po->origdev))
655                 sll->sll_ifindex = orig_dev->ifindex;
656         else
657                 sll->sll_ifindex = dev->ifindex;
658
            /* Status is written last so the frame is handed over only once filled.
             * NOTE(review): the barrier follows the status store, so by itself it
             * does not order the payload writes before it -- confirm this is intended.
             */
659         h->tp_status = status;
660         smp_mb();
661
            /* Flush every page touched by this frame. */
662         {
663                 struct page *p_start, *p_end;
664                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
665
666                 p_start = virt_to_page(h);
667                 p_end = virt_to_page(h_end);
668                 while (p_start <= p_end) {
669                         flush_dcache_page(p_start);
670                         p_start++;
671                 }
672         }
673
674         sk->sk_data_ready(sk, 0);
675
676 drop_n_restore:
677         if (skb_head != skb->data && skb_shared(skb)) {
678                 skb->data = skb_head;
679                 skb->len = skb_len;
680         }
681 drop:
682         kfree_skb(skb);
683         return 0;
684
685 ring_is_full:
686         po->stats.tp_drops++;
687         spin_unlock(&sk->sk_receive_queue.lock);
688
            /* The reader is still notified when the ring is full. */
689         sk->sk_data_ready(sk, 0);
690         if (copy_skb)
691                 kfree_skb(copy_skb);
692         goto drop_n_restore;
693 }
694
695 #endif
696
697
698 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
699                           struct msghdr *msg, size_t len)
700 {
701         struct sock *sk = sock->sk;
702         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
703         struct sk_buff *skb;
704         struct net_device *dev;
705         __be16 proto;
706         unsigned char *addr;
707         int ifindex, err, reserve = 0;
708
709         /*
710          *      Get and verify the address.
711          */
712
713         if (saddr == NULL) {
714                 struct packet_sock *po = pkt_sk(sk);
715
716                 ifindex = po->ifindex;
717                 proto   = po->num;
718                 addr    = NULL;
719         } else {
720                 err = -EINVAL;
721                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
722                         goto out;
723                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
724                         goto out;
725                 ifindex = saddr->sll_ifindex;
726                 proto   = saddr->sll_protocol;
727                 addr    = saddr->sll_addr;
728         }
729
730
731         dev = dev_get_by_index(sock_net(sk), ifindex);
732         err = -ENXIO;
733         if (dev == NULL)
734                 goto out_unlock;
735         if (sock->type == SOCK_RAW)
736                 reserve = dev->hard_header_len;
737
738         err = -ENETDOWN;
739         if (!(dev->flags & IFF_UP))
740                 goto out_unlock;
741
742         err = -EMSGSIZE;
743         if (len > dev->mtu+reserve)
744                 goto out_unlock;
745
746         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
747                                 msg->msg_flags & MSG_DONTWAIT, &err);
748         if (skb==NULL)
749                 goto out_unlock;
750
751         skb_reserve(skb, LL_RESERVED_SPACE(dev));
752         skb_reset_network_header(skb);
753
754         err = -EINVAL;
755         if (sock->type == SOCK_DGRAM &&
756             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
757                 goto out_free;
758
759         /* Returns -EFAULT on error */
760         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
761         if (err)
762                 goto out_free;
763
764         skb->protocol = proto;
765         skb->dev = dev;
766         skb->priority = sk->sk_priority;
767
768         /*
769          *      Now send it
770          */
771
772         err = dev_queue_xmit(skb);
773         if (err > 0 && (err = net_xmit_errno(err)) != 0)
774                 goto out_unlock;
775
776         dev_put(dev);
777
778         return(len);
779
780 out_free:
781         kfree_skb(skb);
782 out_unlock:
783         if (dev)
784                 dev_put(dev);
785 out:
786         return err;
787 }
788
789 /*
790  *      Close a PACKET socket. This is fairly simple. We immediately go
791  *      to 'closed' state and remove our protocol entry in the device list.
792  */
793
/*
 * Release handler for packet sockets.
 *
 * Unlinks the socket from the per-namespace socket list, detaches the
 * receive hook if it is attached, flushes the membership list, tears down
 * the frame ring if one exists, orphans the socket, purges its receive
 * queue and drops the final reference.  Returns 0; a socket with no
 * struct sock attached is a no-op.
 */
794 static int packet_release(struct socket *sock)
795 {
796         struct sock *sk = sock->sk;
797         struct packet_sock *po;
798         struct net *net;
799
800         if (!sk)
801                 return 0;
802
803         net = sock_net(sk);
804         po = pkt_sk(sk);
805
806         write_lock_bh(&net->packet.sklist_lock);
807         sk_del_node_init(sk);
808         write_unlock_bh(&net->packet.sklist_lock);
809
810         /*
811          *      Unhook packet receive handler.
812          */
813
814         if (po->running) {
815                 /*
816                  *      Remove the protocol hook
817                  */
818                 dev_remove_pack(&po->prot_hook);
819                 po->running = 0;
820                 po->num = 0;
821                 __sock_put(sk);         /* pairs with the sock_hold() taken when the hook was added */
822         }
823
824         packet_flush_mclist(sk);
825
826 #ifdef CONFIG_PACKET_MMAP
827         if (po->pg_vec) {
            /* An all-zero request with closing=1 is passed to tear the ring down. */
828                 struct tpacket_req req;
829                 memset(&req, 0, sizeof(req));
830                 packet_set_ring(sk, &req, 1);
831         }
832 #endif
833
834         /*
835          *      Now the socket is dead. No more input will appear.
836          */
837
838         sock_orphan(sk);
839         sock->sk = NULL;
840
841         /* Purge queues */
842
843         skb_queue_purge(&sk->sk_receive_queue);
844         sk_refcnt_debug_release(sk);
845
846         sock_put(sk);
847         return 0;
848 }
849
850 /*
851  *      Attach a packet hook.
852  */
853
/*
 * (Re)bind a packet socket to a device and protocol.
 *
 * @sk:       the packet socket
 * @dev:      device to bind to, or NULL for no specific device
 * @protocol: protocol in network byte order; 0 leaves the hook detached
 *
 * Any existing hook is detached first.  The new hook is attached only when
 * protocol is non-zero and the device is absent or IFF_UP; for a device
 * that is down, ENETDOWN is recorded in sk->sk_err and reported instead.
 * Always returns 0.
 */
854 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
855 {
856         struct packet_sock *po = pkt_sk(sk);
857         /*
858          *      Detach an existing hook if present.
859          */
860
861         lock_sock(sk);
862
863         spin_lock(&po->bind_lock);
864         if (po->running) {
865                 __sock_put(sk);         /* give back the reference held for the attached hook */
866                 po->running = 0;
867                 po->num = 0;
            /* bind_lock is released across dev_remove_pack() and re-taken afterwards. */
868                 spin_unlock(&po->bind_lock);
869                 dev_remove_pack(&po->prot_hook);
870                 spin_lock(&po->bind_lock);
871         }
872
873         po->num = protocol;
874         po->prot_hook.type = protocol;
875         po->prot_hook.dev = dev;
876
877         po->ifindex = dev ? dev->ifindex : 0;
878
879         if (protocol == 0)
880                 goto out_unlock;
881
882         if (!dev || (dev->flags & IFF_UP)) {
883                 dev_add_pack(&po->prot_hook);
884                 sock_hold(sk);          /* the attached hook holds a reference to the socket */
885                 po->running = 1;
886         } else {
887                 sk->sk_err = ENETDOWN;
888                 if (!sock_flag(sk, SOCK_DEAD))
889                         sk->sk_error_report(sk);
890         }
891
892 out_unlock:
893         spin_unlock(&po->bind_lock);
894         release_sock(sk);
895         return 0;
896 }
897
898 /*
899  *      Bind a packet socket to a device
900  */
901
902 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
903 {
904         struct sock *sk=sock->sk;
905         char name[15];
906         struct net_device *dev;
907         int err = -ENODEV;
908
909         /*
910          *      Check legality
911          */
912
913         if (addr_len != sizeof(struct sockaddr))
914                 return -EINVAL;
915         strlcpy(name,uaddr->sa_data,sizeof(name));
916
917         dev = dev_get_by_name(sock_net(sk), name);
918         if (dev) {
919                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
920                 dev_put(dev);
921         }
922         return err;
923 }
924
925 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
926 {
927         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
928         struct sock *sk=sock->sk;
929         struct net_device *dev = NULL;
930         int err;
931
932
933         /*
934          *      Check legality
935          */
936
937         if (addr_len < sizeof(struct sockaddr_ll))
938                 return -EINVAL;
939         if (sll->sll_family != AF_PACKET)
940                 return -EINVAL;
941
942         if (sll->sll_ifindex) {
943                 err = -ENODEV;
944                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
945                 if (dev == NULL)
946                         goto out;
947         }
948         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
949         if (dev)
950                 dev_put(dev);
951
952 out:
953         return err;
954 }
955
956 static struct proto packet_proto = {
957         .name     = "PACKET",
958         .owner    = THIS_MODULE,
959         .obj_size = sizeof(struct packet_sock),
960 };
961
962 /*
963  *      Create a packet of type SOCK_PACKET.
964  */
965
966 static int packet_create(struct net *net, struct socket *sock, int protocol)
967 {
968         struct sock *sk;
969         struct packet_sock *po;
970         __be16 proto = (__force __be16)protocol; /* weird, but documented */
971         int err;
972
973         if (!capable(CAP_NET_RAW))
974                 return -EPERM;
975         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
976             sock->type != SOCK_PACKET)
977                 return -ESOCKTNOSUPPORT;
978
979         sock->state = SS_UNCONNECTED;
980
981         err = -ENOBUFS;
982         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
983         if (sk == NULL)
984                 goto out;
985
986         sock->ops = &packet_ops;
987         if (sock->type == SOCK_PACKET)
988                 sock->ops = &packet_ops_spkt;
989
990         sock_init_data(sock, sk);
991
992         po = pkt_sk(sk);
993         sk->sk_family = PF_PACKET;
994         po->num = proto;
995
996         sk->sk_destruct = packet_sock_destruct;
997         sk_refcnt_debug_inc(sk);
998
999         /*
1000          *      Attach a protocol block
1001          */
1002
1003         spin_lock_init(&po->bind_lock);
1004         po->prot_hook.func = packet_rcv;
1005
1006         if (sock->type == SOCK_PACKET)
1007                 po->prot_hook.func = packet_rcv_spkt;
1008
1009         po->prot_hook.af_packet_priv = sk;
1010
1011         if (proto) {
1012                 po->prot_hook.type = proto;
1013                 dev_add_pack(&po->prot_hook);
1014                 sock_hold(sk);
1015                 po->running = 1;
1016         }
1017
1018         write_lock_bh(&net->packet.sklist_lock);
1019         sk_add_node(sk, &net->packet.sklist);
1020         write_unlock_bh(&net->packet.sklist_lock);
1021         return(0);
1022 out:
1023         return err;
1024 }
1025
1026 /*
1027  *      Pull a packet from our receive queue and hand it to the user.
1028  *      If necessary we block.
1029  */
1030
1031 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1032                           struct msghdr *msg, size_t len, int flags)
1033 {
1034         struct sock *sk = sock->sk;
1035         struct sk_buff *skb;
1036         int copied, err;
1037         struct sockaddr_ll *sll;
1038
1039         err = -EINVAL;
1040         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1041                 goto out;
1042
1043 #if 0
1044         /* What error should we return now? EUNATTACH? */
1045         if (pkt_sk(sk)->ifindex < 0)
1046                 return -ENODEV;
1047 #endif
1048
1049         /*
1050          *      Call the generic datagram receiver. This handles all sorts
1051          *      of horrible races and re-entrancy so we can forget about it
1052          *      in the protocol layers.
1053          *
1054          *      Now it will return ENETDOWN, if device have just gone down,
1055          *      but then it will block.
1056          */
1057
1058         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1059
1060         /*
1061          *      An error occurred so return it. Because skb_recv_datagram()
1062          *      handles the blocking we don't see and worry about blocking
1063          *      retries.
1064          */
1065
1066         if (skb == NULL)
1067                 goto out;
1068
1069         /*
1070          *      If the address length field is there to be filled in, we fill
1071          *      it in now.
1072          */
1073
1074         sll = &PACKET_SKB_CB(skb)->sa.ll;
1075         if (sock->type == SOCK_PACKET)
1076                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1077         else
1078                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1079
1080         /*
1081          *      You lose any data beyond the buffer you gave. If it worries a
1082          *      user program they can ask the device for its MTU anyway.
1083          */
1084
1085         copied = skb->len;
1086         if (copied > len)
1087         {
1088                 copied=len;
1089                 msg->msg_flags|=MSG_TRUNC;
1090         }
1091
1092         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1093         if (err)
1094                 goto out_free;
1095
1096         sock_recv_timestamp(msg, sk, skb);
1097
1098         if (msg->msg_name)
1099                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1100                        msg->msg_namelen);
1101
1102         if (pkt_sk(sk)->auxdata) {
1103                 struct tpacket_auxdata aux;
1104
1105                 aux.tp_status = TP_STATUS_USER;
1106                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1107                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1108                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1109                 aux.tp_snaplen = skb->len;
1110                 aux.tp_mac = 0;
1111                 aux.tp_net = skb_network_offset(skb);
1112
1113                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1114         }
1115
1116         /*
1117          *      Free or return the buffer as appropriate. Again this
1118          *      hides all the races and re-entrancy issues from us.
1119          */
1120         err = (flags&MSG_TRUNC) ? skb->len : copied;
1121
1122 out_free:
1123         skb_free_datagram(sk, skb);
1124 out:
1125         return err;
1126 }
1127
1128 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1129                                int *uaddr_len, int peer)
1130 {
1131         struct net_device *dev;
1132         struct sock *sk = sock->sk;
1133
1134         if (peer)
1135                 return -EOPNOTSUPP;
1136
1137         uaddr->sa_family = AF_PACKET;
1138         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1139         if (dev) {
1140                 strlcpy(uaddr->sa_data, dev->name, 15);
1141                 dev_put(dev);
1142         } else
1143                 memset(uaddr->sa_data, 0, 14);
1144         *uaddr_len = sizeof(*uaddr);
1145
1146         return 0;
1147 }
1148
1149 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1150                           int *uaddr_len, int peer)
1151 {
1152         struct net_device *dev;
1153         struct sock *sk = sock->sk;
1154         struct packet_sock *po = pkt_sk(sk);
1155         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1156
1157         if (peer)
1158                 return -EOPNOTSUPP;
1159
1160         sll->sll_family = AF_PACKET;
1161         sll->sll_ifindex = po->ifindex;
1162         sll->sll_protocol = po->num;
1163         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1164         if (dev) {
1165                 sll->sll_hatype = dev->type;
1166                 sll->sll_halen = dev->addr_len;
1167                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1168                 dev_put(dev);
1169         } else {
1170                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1171                 sll->sll_halen = 0;
1172         }
1173         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1174
1175         return 0;
1176 }
1177
1178 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1179 {
1180         switch (i->type) {
1181         case PACKET_MR_MULTICAST:
1182                 if (what > 0)
1183                         dev_mc_add(dev, i->addr, i->alen, 0);
1184                 else
1185                         dev_mc_delete(dev, i->addr, i->alen, 0);
1186                 break;
1187         case PACKET_MR_PROMISC:
1188                 dev_set_promiscuity(dev, what);
1189                 break;
1190         case PACKET_MR_ALLMULTI:
1191                 dev_set_allmulti(dev, what);
1192                 break;
1193         default:;
1194         }
1195 }
1196
1197 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1198 {
1199         for ( ; i; i=i->next) {
1200                 if (i->ifindex == dev->ifindex)
1201                         packet_dev_mc(dev, i, what);
1202         }
1203 }
1204
1205 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1206 {
1207         struct packet_sock *po = pkt_sk(sk);
1208         struct packet_mclist *ml, *i;
1209         struct net_device *dev;
1210         int err;
1211
1212         rtnl_lock();
1213
1214         err = -ENODEV;
1215         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1216         if (!dev)
1217                 goto done;
1218
1219         err = -EINVAL;
1220         if (mreq->mr_alen > dev->addr_len)
1221                 goto done;
1222
1223         err = -ENOBUFS;
1224         i = kmalloc(sizeof(*i), GFP_KERNEL);
1225         if (i == NULL)
1226                 goto done;
1227
1228         err = 0;
1229         for (ml = po->mclist; ml; ml = ml->next) {
1230                 if (ml->ifindex == mreq->mr_ifindex &&
1231                     ml->type == mreq->mr_type &&
1232                     ml->alen == mreq->mr_alen &&
1233                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1234                         ml->count++;
1235                         /* Free the new element ... */
1236                         kfree(i);
1237                         goto done;
1238                 }
1239         }
1240
1241         i->type = mreq->mr_type;
1242         i->ifindex = mreq->mr_ifindex;
1243         i->alen = mreq->mr_alen;
1244         memcpy(i->addr, mreq->mr_address, i->alen);
1245         i->count = 1;
1246         i->next = po->mclist;
1247         po->mclist = i;
1248         packet_dev_mc(dev, i, +1);
1249
1250 done:
1251         rtnl_unlock();
1252         return err;
1253 }
1254
1255 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1256 {
1257         struct packet_mclist *ml, **mlp;
1258
1259         rtnl_lock();
1260
1261         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1262                 if (ml->ifindex == mreq->mr_ifindex &&
1263                     ml->type == mreq->mr_type &&
1264                     ml->alen == mreq->mr_alen &&
1265                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1266                         if (--ml->count == 0) {
1267                                 struct net_device *dev;
1268                                 *mlp = ml->next;
1269                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1270                                 if (dev) {
1271                                         packet_dev_mc(dev, ml, -1);
1272                                         dev_put(dev);
1273                                 }
1274                                 kfree(ml);
1275                         }
1276                         rtnl_unlock();
1277                         return 0;
1278                 }
1279         }
1280         rtnl_unlock();
1281         return -EADDRNOTAVAIL;
1282 }
1283
1284 static void packet_flush_mclist(struct sock *sk)
1285 {
1286         struct packet_sock *po = pkt_sk(sk);
1287         struct packet_mclist *ml;
1288
1289         if (!po->mclist)
1290                 return;
1291
1292         rtnl_lock();
1293         while ((ml = po->mclist) != NULL) {
1294                 struct net_device *dev;
1295
1296                 po->mclist = ml->next;
1297                 if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1298                         packet_dev_mc(dev, ml, -1);
1299                         dev_put(dev);
1300                 }
1301                 kfree(ml);
1302         }
1303         rtnl_unlock();
1304 }
1305
1306 static int
1307 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1308 {
1309         struct sock *sk = sock->sk;
1310         struct packet_sock *po = pkt_sk(sk);
1311         int ret;
1312
1313         if (level != SOL_PACKET)
1314                 return -ENOPROTOOPT;
1315
1316         switch(optname) {
1317         case PACKET_ADD_MEMBERSHIP:
1318         case PACKET_DROP_MEMBERSHIP:
1319         {
1320                 struct packet_mreq_max mreq;
1321                 int len = optlen;
1322                 memset(&mreq, 0, sizeof(mreq));
1323                 if (len < sizeof(struct packet_mreq))
1324                         return -EINVAL;
1325                 if (len > sizeof(mreq))
1326                         len = sizeof(mreq);
1327                 if (copy_from_user(&mreq,optval,len))
1328                         return -EFAULT;
1329                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1330                         return -EINVAL;
1331                 if (optname == PACKET_ADD_MEMBERSHIP)
1332                         ret = packet_mc_add(sk, &mreq);
1333                 else
1334                         ret = packet_mc_drop(sk, &mreq);
1335                 return ret;
1336         }
1337
1338 #ifdef CONFIG_PACKET_MMAP
1339         case PACKET_RX_RING:
1340         {
1341                 struct tpacket_req req;
1342
1343                 if (optlen<sizeof(req))
1344                         return -EINVAL;
1345                 if (copy_from_user(&req,optval,sizeof(req)))
1346                         return -EFAULT;
1347                 return packet_set_ring(sk, &req, 0);
1348         }
1349         case PACKET_COPY_THRESH:
1350         {
1351                 int val;
1352
1353                 if (optlen!=sizeof(val))
1354                         return -EINVAL;
1355                 if (copy_from_user(&val,optval,sizeof(val)))
1356                         return -EFAULT;
1357
1358                 pkt_sk(sk)->copy_thresh = val;
1359                 return 0;
1360         }
1361 #endif
1362         case PACKET_AUXDATA:
1363         {
1364                 int val;
1365
1366                 if (optlen < sizeof(val))
1367                         return -EINVAL;
1368                 if (copy_from_user(&val, optval, sizeof(val)))
1369                         return -EFAULT;
1370
1371                 po->auxdata = !!val;
1372                 return 0;
1373         }
1374         case PACKET_ORIGDEV:
1375         {
1376                 int val;
1377
1378                 if (optlen < sizeof(val))
1379                         return -EINVAL;
1380                 if (copy_from_user(&val, optval, sizeof(val)))
1381                         return -EFAULT;
1382
1383                 po->origdev = !!val;
1384                 return 0;
1385         }
1386         default:
1387                 return -ENOPROTOOPT;
1388         }
1389 }
1390
1391 static int packet_getsockopt(struct socket *sock, int level, int optname,
1392                              char __user *optval, int __user *optlen)
1393 {
1394         int len;
1395         int val;
1396         struct sock *sk = sock->sk;
1397         struct packet_sock *po = pkt_sk(sk);
1398         void *data;
1399         struct tpacket_stats st;
1400
1401         if (level != SOL_PACKET)
1402                 return -ENOPROTOOPT;
1403
1404         if (get_user(len, optlen))
1405                 return -EFAULT;
1406
1407         if (len < 0)
1408                 return -EINVAL;
1409
1410         switch(optname) {
1411         case PACKET_STATISTICS:
1412                 if (len > sizeof(struct tpacket_stats))
1413                         len = sizeof(struct tpacket_stats);
1414                 spin_lock_bh(&sk->sk_receive_queue.lock);
1415                 st = po->stats;
1416                 memset(&po->stats, 0, sizeof(st));
1417                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1418                 st.tp_packets += st.tp_drops;
1419
1420                 data = &st;
1421                 break;
1422         case PACKET_AUXDATA:
1423                 if (len > sizeof(int))
1424                         len = sizeof(int);
1425                 val = po->auxdata;
1426
1427                 data = &val;
1428                 break;
1429         case PACKET_ORIGDEV:
1430                 if (len > sizeof(int))
1431                         len = sizeof(int);
1432                 val = po->origdev;
1433
1434                 data = &val;
1435                 break;
1436         default:
1437                 return -ENOPROTOOPT;
1438         }
1439
1440         if (put_user(len, optlen))
1441                 return -EFAULT;
1442         if (copy_to_user(optval, data, len))
1443                 return -EFAULT;
1444         return 0;
1445 }
1446
1447
1448 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1449 {
1450         struct sock *sk;
1451         struct hlist_node *node;
1452         struct net_device *dev = data;
1453         struct net *net = dev_net(dev);
1454
1455         read_lock(&net->packet.sklist_lock);
1456         sk_for_each(sk, node, &net->packet.sklist) {
1457                 struct packet_sock *po = pkt_sk(sk);
1458
1459                 switch (msg) {
1460                 case NETDEV_UNREGISTER:
1461                         if (po->mclist)
1462                                 packet_dev_mclist(dev, po->mclist, -1);
1463                         /* fallthrough */
1464
1465                 case NETDEV_DOWN:
1466                         if (dev->ifindex == po->ifindex) {
1467                                 spin_lock(&po->bind_lock);
1468                                 if (po->running) {
1469                                         __dev_remove_pack(&po->prot_hook);
1470                                         __sock_put(sk);
1471                                         po->running = 0;
1472                                         sk->sk_err = ENETDOWN;
1473                                         if (!sock_flag(sk, SOCK_DEAD))
1474                                                 sk->sk_error_report(sk);
1475                                 }
1476                                 if (msg == NETDEV_UNREGISTER) {
1477                                         po->ifindex = -1;
1478                                         po->prot_hook.dev = NULL;
1479                                 }
1480                                 spin_unlock(&po->bind_lock);
1481                         }
1482                         break;
1483                 case NETDEV_UP:
1484                         spin_lock(&po->bind_lock);
1485                         if (dev->ifindex == po->ifindex && po->num &&
1486                             !po->running) {
1487                                 dev_add_pack(&po->prot_hook);
1488                                 sock_hold(sk);
1489                                 po->running = 1;
1490                         }
1491                         spin_unlock(&po->bind_lock);
1492                         break;
1493                 }
1494         }
1495         read_unlock(&net->packet.sklist_lock);
1496         return NOTIFY_DONE;
1497 }
1498
1499
1500 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1501                         unsigned long arg)
1502 {
1503         struct sock *sk = sock->sk;
1504
1505         switch(cmd) {
1506                 case SIOCOUTQ:
1507                 {
1508                         int amount = atomic_read(&sk->sk_wmem_alloc);
1509                         return put_user(amount, (int __user *)arg);
1510                 }
1511                 case SIOCINQ:
1512                 {
1513                         struct sk_buff *skb;
1514                         int amount = 0;
1515
1516                         spin_lock_bh(&sk->sk_receive_queue.lock);
1517                         skb = skb_peek(&sk->sk_receive_queue);
1518                         if (skb)
1519                                 amount = skb->len;
1520                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1521                         return put_user(amount, (int __user *)arg);
1522                 }
1523                 case SIOCGSTAMP:
1524                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1525                 case SIOCGSTAMPNS:
1526                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1527
1528 #ifdef CONFIG_INET
1529                 case SIOCADDRT:
1530                 case SIOCDELRT:
1531                 case SIOCDARP:
1532                 case SIOCGARP:
1533                 case SIOCSARP:
1534                 case SIOCGIFADDR:
1535                 case SIOCSIFADDR:
1536                 case SIOCGIFBRDADDR:
1537                 case SIOCSIFBRDADDR:
1538                 case SIOCGIFNETMASK:
1539                 case SIOCSIFNETMASK:
1540                 case SIOCGIFDSTADDR:
1541                 case SIOCSIFDSTADDR:
1542                 case SIOCSIFFLAGS:
1543                         if (sock_net(sk) != &init_net)
1544                                 return -ENOIOCTLCMD;
1545                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1546 #endif
1547
1548                 default:
1549                         return -ENOIOCTLCMD;
1550         }
1551         return 0;
1552 }
1553
1554 #ifndef CONFIG_PACKET_MMAP
1555 #define packet_mmap sock_no_mmap
1556 #define packet_poll datagram_poll
1557 #else
1558
1559 static unsigned int packet_poll(struct file * file, struct socket *sock,
1560                                 poll_table *wait)
1561 {
1562         struct sock *sk = sock->sk;
1563         struct packet_sock *po = pkt_sk(sk);
1564         unsigned int mask = datagram_poll(file, sock, wait);
1565
1566         spin_lock_bh(&sk->sk_receive_queue.lock);
1567         if (po->pg_vec) {
1568                 unsigned last = po->head ? po->head-1 : po->frame_max;
1569                 struct tpacket_hdr *h;
1570
1571                 h = packet_lookup_frame(po, last);
1572
1573                 if (h->tp_status)
1574                         mask |= POLLIN | POLLRDNORM;
1575         }
1576         spin_unlock_bh(&sk->sk_receive_queue.lock);
1577         return mask;
1578 }
1579
1580
1581 /* Dirty? Well, I still did not learn better way to account
1582  * for user mmaps.
1583  */
1584
1585 static void packet_mm_open(struct vm_area_struct *vma)
1586 {
1587         struct file *file = vma->vm_file;
1588         struct socket * sock = file->private_data;
1589         struct sock *sk = sock->sk;
1590
1591         if (sk)
1592                 atomic_inc(&pkt_sk(sk)->mapped);
1593 }
1594
1595 static void packet_mm_close(struct vm_area_struct *vma)
1596 {
1597         struct file *file = vma->vm_file;
1598         struct socket * sock = file->private_data;
1599         struct sock *sk = sock->sk;
1600
1601         if (sk)
1602                 atomic_dec(&pkt_sk(sk)->mapped);
1603 }
1604
1605 static struct vm_operations_struct packet_mmap_ops = {
1606         .open = packet_mm_open,
1607         .close =packet_mm_close,
1608 };
1609
/*
 *      Free a block vector: every non-NULL block of the given page order,
 *      then the vector itself.
 *
 *      pg_vec: vector of "len" block pointers; entries may be NULL
 *      order:  page order each block was allocated with
 *      len:    number of entries in pg_vec
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
        unsigned int i; /* same type as len: no signed/unsigned comparison */

        for (i = 0; i < len; i++) {
                if (likely(pg_vec[i]))
                        free_pages((unsigned long) pg_vec[i], order);
        }
        kfree(pg_vec);
}
1620
1621 static inline char *alloc_one_pg_vec_page(unsigned long order)
1622 {
1623         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1624                                          order);
1625 }
1626
1627 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1628 {
1629         unsigned int block_nr = req->tp_block_nr;
1630         char **pg_vec;
1631         int i;
1632
1633         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1634         if (unlikely(!pg_vec))
1635                 goto out;
1636
1637         for (i = 0; i < block_nr; i++) {
1638                 pg_vec[i] = alloc_one_pg_vec_page(order);
1639                 if (unlikely(!pg_vec[i]))
1640                         goto out_free_pgvec;
1641         }
1642
1643 out:
1644         return pg_vec;
1645
1646 out_free_pgvec:
1647         free_pg_vec(pg_vec, order, block_nr);
1648         pg_vec = NULL;
1649         goto out;
1650 }
1651
1652 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1653 {
1654         char **pg_vec = NULL;
1655         struct packet_sock *po = pkt_sk(sk);
1656         int was_running, order = 0;
1657         __be16 num;
1658         int err = 0;
1659
1660         if (req->tp_block_nr) {
1661                 int i;
1662
1663                 /* Sanity tests and some calculations */
1664
1665                 if (unlikely(po->pg_vec))
1666                         return -EBUSY;
1667
1668                 if (unlikely((int)req->tp_block_size <= 0))
1669                         return -EINVAL;
1670                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1671                         return -EINVAL;
1672                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1673                         return -EINVAL;
1674                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1675                         return -EINVAL;
1676
1677                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1678                 if (unlikely(po->frames_per_block <= 0))
1679                         return -EINVAL;
1680                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1681                              req->tp_frame_nr))
1682                         return -EINVAL;
1683
1684                 err = -ENOMEM;
1685                 order = get_order(req->tp_block_size);
1686                 pg_vec = alloc_pg_vec(req, order);
1687                 if (unlikely(!pg_vec))
1688                         goto out;
1689
1690                 for (i = 0; i < req->tp_block_nr; i++) {
1691                         char *ptr = pg_vec[i];
1692                         struct tpacket_hdr *header;
1693                         int k;
1694
1695                         for (k = 0; k < po->frames_per_block; k++) {
1696                                 header = (struct tpacket_hdr *) ptr;
1697                                 header->tp_status = TP_STATUS_KERNEL;
1698                                 ptr += req->tp_frame_size;
1699                         }
1700                 }
1701                 /* Done */
1702         } else {
1703                 if (unlikely(req->tp_frame_nr))
1704                         return -EINVAL;
1705         }
1706
1707         lock_sock(sk);
1708
1709         /* Detach socket from network */
1710         spin_lock(&po->bind_lock);
1711         was_running = po->running;
1712         num = po->num;
1713         if (was_running) {
1714                 __dev_remove_pack(&po->prot_hook);
1715                 po->num = 0;
1716                 po->running = 0;
1717                 __sock_put(sk);
1718         }
1719         spin_unlock(&po->bind_lock);
1720
1721         synchronize_net();
1722
1723         err = -EBUSY;
1724         if (closing || atomic_read(&po->mapped) == 0) {
1725                 err = 0;
1726 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1727
1728                 spin_lock_bh(&sk->sk_receive_queue.lock);
1729                 pg_vec = XC(po->pg_vec, pg_vec);
1730                 po->frame_max = (req->tp_frame_nr - 1);
1731                 po->head = 0;
1732                 po->frame_size = req->tp_frame_size;
1733                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1734
1735                 order = XC(po->pg_vec_order, order);
1736                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1737
1738                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1739                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1740                 skb_queue_purge(&sk->sk_receive_queue);
1741 #undef XC
1742                 if (atomic_read(&po->mapped))
1743                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1744         }
1745
1746         spin_lock(&po->bind_lock);
1747         if (was_running && !po->running) {
1748                 sock_hold(sk);
1749                 po->running = 1;
1750                 po->num = num;
1751                 dev_add_pack(&po->prot_hook);
1752         }
1753         spin_unlock(&po->bind_lock);
1754
1755         release_sock(sk);
1756
1757         if (pg_vec)
1758                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1759 out:
1760         return err;
1761 }
1762
1763 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1764 {
1765         struct sock *sk = sock->sk;
1766         struct packet_sock *po = pkt_sk(sk);
1767         unsigned long size;
1768         unsigned long start;
1769         int err = -EINVAL;
1770         int i;
1771
1772         if (vma->vm_pgoff)
1773                 return -EINVAL;
1774
1775         size = vma->vm_end - vma->vm_start;
1776
1777         lock_sock(sk);
1778         if (po->pg_vec == NULL)
1779                 goto out;
1780         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1781                 goto out;
1782
1783         start = vma->vm_start;
1784         for (i = 0; i < po->pg_vec_len; i++) {
1785                 struct page *page = virt_to_page(po->pg_vec[i]);
1786                 int pg_num;
1787
1788                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1789                         err = vm_insert_page(vma, start, page);
1790                         if (unlikely(err))
1791                                 goto out;
1792                         start += PAGE_SIZE;
1793                 }
1794         }
1795         atomic_inc(&po->mapped);
1796         vma->vm_ops = &packet_mmap_ops;
1797         err = 0;
1798
1799 out:
1800         release_sock(sk);
1801         return err;
1802 }
1803 #endif
1804
1805
1806 static const struct proto_ops packet_ops_spkt = {
1807         .family =       PF_PACKET,
1808         .owner =        THIS_MODULE,
1809         .release =      packet_release,
1810         .bind =         packet_bind_spkt,
1811         .connect =      sock_no_connect,
1812         .socketpair =   sock_no_socketpair,
1813         .accept =       sock_no_accept,
1814         .getname =      packet_getname_spkt,
1815         .poll =         datagram_poll,
1816         .ioctl =        packet_ioctl,
1817         .listen =       sock_no_listen,
1818         .shutdown =     sock_no_shutdown,
1819         .setsockopt =   sock_no_setsockopt,
1820         .getsockopt =   sock_no_getsockopt,
1821         .sendmsg =      packet_sendmsg_spkt,
1822         .recvmsg =      packet_recvmsg,
1823         .mmap =         sock_no_mmap,
1824         .sendpage =     sock_no_sendpage,
1825 };
1826
1827 static const struct proto_ops packet_ops = {
1828         .family =       PF_PACKET,
1829         .owner =        THIS_MODULE,
1830         .release =      packet_release,
1831         .bind =         packet_bind,
1832         .connect =      sock_no_connect,
1833         .socketpair =   sock_no_socketpair,
1834         .accept =       sock_no_accept,
1835         .getname =      packet_getname,
1836         .poll =         packet_poll,
1837         .ioctl =        packet_ioctl,
1838         .listen =       sock_no_listen,
1839         .shutdown =     sock_no_shutdown,
1840         .setsockopt =   packet_setsockopt,
1841         .getsockopt =   packet_getsockopt,
1842         .sendmsg =      packet_sendmsg,
1843         .recvmsg =      packet_recvmsg,
1844         .mmap =         packet_mmap,
1845         .sendpage =     sock_no_sendpage,
1846 };
1847
1848 static struct net_proto_family packet_family_ops = {
1849         .family =       PF_PACKET,
1850         .create =       packet_create,
1851         .owner  =       THIS_MODULE,
1852 };
1853
1854 static struct notifier_block packet_netdev_notifier = {
1855         .notifier_call =packet_notifier,
1856 };
1857
1858 #ifdef CONFIG_PROC_FS
1859 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
1860 {
1861         struct sock *s;
1862         struct hlist_node *node;
1863
1864         sk_for_each(s, node, &net->packet.sklist) {
1865                 if (!off--)
1866                         return s;
1867         }
1868         return NULL;
1869 }
1870
1871 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1872         __acquires(seq_file_net(seq)->packet.sklist_lock)
1873 {
1874         struct net *net = seq_file_net(seq);
1875         read_lock(&net->packet.sklist_lock);
1876         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
1877 }
1878
1879 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1880 {
1881         struct net *net = seq_file_net(seq);
1882         ++*pos;
1883         return  (v == SEQ_START_TOKEN)
1884                 ? sk_head(&net->packet.sklist)
1885                 : sk_next((struct sock*)v) ;
1886 }
1887
1888 static void packet_seq_stop(struct seq_file *seq, void *v)
1889         __releases(seq_file_net(seq)->packet.sklist_lock)
1890 {
1891         struct net *net = seq_file_net(seq);
1892         read_unlock(&net->packet.sklist_lock);
1893 }
1894
1895 static int packet_seq_show(struct seq_file *seq, void *v)
1896 {
1897         if (v == SEQ_START_TOKEN)
1898                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1899         else {
1900                 struct sock *s = v;
1901                 const struct packet_sock *po = pkt_sk(s);
1902
1903                 seq_printf(seq,
1904                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1905                            s,
1906                            atomic_read(&s->sk_refcnt),
1907                            s->sk_type,
1908                            ntohs(po->num),
1909                            po->ifindex,
1910                            po->running,
1911                            atomic_read(&s->sk_rmem_alloc),
1912                            sock_i_uid(s),
1913                            sock_i_ino(s) );
1914         }
1915
1916         return 0;
1917 }
1918
1919 static const struct seq_operations packet_seq_ops = {
1920         .start  = packet_seq_start,
1921         .next   = packet_seq_next,
1922         .stop   = packet_seq_stop,
1923         .show   = packet_seq_show,
1924 };
1925
1926 static int packet_seq_open(struct inode *inode, struct file *file)
1927 {
1928         return seq_open_net(inode, file, &packet_seq_ops,
1929                             sizeof(struct seq_net_private));
1930 }
1931
1932 static const struct file_operations packet_seq_fops = {
1933         .owner          = THIS_MODULE,
1934         .open           = packet_seq_open,
1935         .read           = seq_read,
1936         .llseek         = seq_lseek,
1937         .release        = seq_release_net,
1938 };
1939
1940 #endif
1941
1942 static int packet_net_init(struct net *net)
1943 {
1944         rwlock_init(&net->packet.sklist_lock);
1945         INIT_HLIST_HEAD(&net->packet.sklist);
1946
1947         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
1948                 return -ENOMEM;
1949
1950         return 0;
1951 }
1952
1953 static void packet_net_exit(struct net *net)
1954 {
1955         proc_net_remove(net, "packet");
1956 }
1957
1958 static struct pernet_operations packet_net_ops = {
1959         .init = packet_net_init,
1960         .exit = packet_net_exit,
1961 };
1962
1963
1964 static void __exit packet_exit(void)
1965 {
1966         unregister_netdevice_notifier(&packet_netdev_notifier);
1967         unregister_pernet_subsys(&packet_net_ops);
1968         sock_unregister(PF_PACKET);
1969         proto_unregister(&packet_proto);
1970 }
1971
1972 static int __init packet_init(void)
1973 {
1974         int rc = proto_register(&packet_proto, 0);
1975
1976         if (rc != 0)
1977                 goto out;
1978
1979         sock_register(&packet_family_ops);
1980         register_pernet_subsys(&packet_net_ops);
1981         register_netdevice_notifier(&packet_netdev_notifier);
1982 out:
1983         return rc;
1984 }
1985
1986 module_init(packet_init);
1987 module_exit(packet_exit);
1988 MODULE_LICENSE("GPL");
1989 MODULE_ALIAS_NETPROTO(PF_PACKET);