Pull osi-now into release branch
[linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85
86 /*
87    Assumptions:
88    - if device has no dev->hard_header routine, it adds and removes ll header
89      inside itself. In this case ll header is invisible outside of device,
90      but higher levels still should reserve dev->hard_header_len.
91      Some devices are clever enough to reallocate the skb when the header
92      will not fit into the reserved space (tunnel), other ones are silly
93      (PPP).
94    - packet socket receives packets with pulled ll header,
95      so that SOCK_RAW should push it back.
96
97 On receive:
98 -----------
99
100 Incoming, dev->hard_header!=NULL
101    mac_header -> ll header
102    data       -> data
103
104 Outgoing, dev->hard_header!=NULL
105    mac_header -> ll header
106    data       -> ll header
107
108 Incoming, dev->hard_header==NULL
109    mac_header -> UNKNOWN position. It is very likely that it points to the ll
110                  header.  PPP does this, which is wrong, because it introduces
111                  asymmetry between rx and tx paths.
112    data       -> data
113
114 Outgoing, dev->hard_header==NULL
115    mac_header -> data. ll header is still not built!
116    data       -> data
117
118 Resume
119   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
120
121
122 On transmit:
123 ------------
124
125 dev->hard_header != NULL
126    mac_header -> ll header
127    data       -> ll header
128
129 dev->hard_header == NULL (ll header is added by device, we cannot control it)
130    mac_header -> data
131    data       -> data
132
133    We should set nh.raw on output to the correct position,
134    the packet classifier depends on it.
135  */
136
137 /* List of all packet sockets. */
138 static HLIST_HEAD(packet_sklist);
139 static DEFINE_RWLOCK(packet_sklist_lock);
140
141 static atomic_t packet_socks_nr;
142
143
144 /* Private packet socket structures. */
145
146 struct packet_mclist
147 {
148         struct packet_mclist    *next;
149         int                     ifindex;
150         int                     count;
151         unsigned short          type;
152         unsigned short          alen;
153         unsigned char           addr[MAX_ADDR_LEN];
154 };
155 /* identical to struct packet_mreq except it has
156  * a longer address field.
157  */
158 struct packet_mreq_max
159 {
160         int             mr_ifindex;
161         unsigned short  mr_type;
162         unsigned short  mr_alen;
163         unsigned char   mr_address[MAX_ADDR_LEN];
164 };
165
/* Helpers defined further down but already needed by packet_release(). */
static void packet_flush_mclist(struct sock *sk);

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
#endif
171
172 struct packet_sock {
173         /* struct sock has to be the first member of packet_sock */
174         struct sock             sk;
175         struct tpacket_stats    stats;
176 #ifdef CONFIG_PACKET_MMAP
177         char *                  *pg_vec;
178         unsigned int            head;
179         unsigned int            frames_per_block;
180         unsigned int            frame_size;
181         unsigned int            frame_max;
182         int                     copy_thresh;
183 #endif
184         struct packet_type      prot_hook;
185         spinlock_t              bind_lock;
186         unsigned int            running:1,      /* prot_hook is attached*/
187                                 auxdata:1,
188                                 origdev:1;
189         int                     ifindex;        /* bound device         */
190         __be16                  num;
191         struct packet_mclist    *mclist;
192 #ifdef CONFIG_PACKET_MMAP
193         atomic_t                mapped;
194         unsigned int            pg_vec_order;
195         unsigned int            pg_vec_pages;
196         unsigned int            pg_vec_len;
197 #endif
198 };
199
200 struct packet_skb_cb {
201         unsigned int origlen;
202         union {
203                 struct sockaddr_pkt pkt;
204                 struct sockaddr_ll ll;
205         } sa;
206 };
207
208 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
209
#ifdef CONFIG_PACKET_MMAP

/*
 * Translate a ring frame index into the address of that frame's header.
 *
 * Frames are stored frames_per_block to a block; block N starts at
 * po->pg_vec[N] and frames inside a block are frame_size bytes apart.
 * No bounds checking is done on position.
 */
static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
	unsigned int block = position / po->frames_per_block;
	unsigned int slot = position % po->frames_per_block;
	char *frame = po->pg_vec[block] + (slot * po->frame_size);

	return (struct tpacket_hdr *)frame;
}
#endif
222
/*
 * Convert a struct sock pointer into the packet_sock that contains it.
 * A plain cast suffices because sk is the first member of packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
227
228 static void packet_sock_destruct(struct sock *sk)
229 {
230         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
231         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
232
233         if (!sock_flag(sk, SOCK_DEAD)) {
234                 printk("Attempt to release alive packet socket: %p\n", sk);
235                 return;
236         }
237
238         atomic_dec(&packet_socks_nr);
239 #ifdef PACKET_REFCNT_DEBUG
240         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
241 #endif
242 }
243
244
245 static const struct proto_ops packet_ops;
246
247 static const struct proto_ops packet_ops_spkt;
248
249 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
250 {
251         struct sock *sk;
252         struct sockaddr_pkt *spkt;
253
254         /*
255          *      When we registered the protocol we saved the socket in the data
256          *      field for just this event.
257          */
258
259         sk = pt->af_packet_priv;
260
261         /*
262          *      Yank back the headers [hope the device set this
263          *      right or kerboom...]
264          *
265          *      Incoming packets have ll header pulled,
266          *      push it back.
267          *
268          *      For outgoing ones skb->data == skb_mac_header(skb)
269          *      so that this procedure is noop.
270          */
271
272         if (skb->pkt_type == PACKET_LOOPBACK)
273                 goto out;
274
275         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
276                 goto oom;
277
278         /* drop any routing info */
279         dst_release(skb->dst);
280         skb->dst = NULL;
281
282         /* drop conntrack reference */
283         nf_reset(skb);
284
285         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
286
287         skb_push(skb, skb->data - skb_mac_header(skb));
288
289         /*
290          *      The SOCK_PACKET socket receives _all_ frames.
291          */
292
293         spkt->spkt_family = dev->type;
294         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
295         spkt->spkt_protocol = skb->protocol;
296
297         /*
298          *      Charge the memory to the socket. This is done specifically
299          *      to prevent sockets using all the memory up.
300          */
301
302         if (sock_queue_rcv_skb(sk,skb) == 0)
303                 return 0;
304
305 out:
306         kfree_skb(skb);
307 oom:
308         return 0;
309 }
310
311
312 /*
313  *      Output a raw packet to a device layer. This bypasses all the other
314  *      protocol layers and you must therefore supply it with a complete frame
315  */
316
317 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
318                                struct msghdr *msg, size_t len)
319 {
320         struct sock *sk = sock->sk;
321         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
322         struct sk_buff *skb;
323         struct net_device *dev;
324         __be16 proto=0;
325         int err;
326
327         /*
328          *      Get and verify the address.
329          */
330
331         if (saddr)
332         {
333                 if (msg->msg_namelen < sizeof(struct sockaddr))
334                         return(-EINVAL);
335                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
336                         proto=saddr->spkt_protocol;
337         }
338         else
339                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
340
341         /*
342          *      Find the device first to size check it
343          */
344
345         saddr->spkt_device[13] = 0;
346         dev = dev_get_by_name(saddr->spkt_device);
347         err = -ENODEV;
348         if (dev == NULL)
349                 goto out_unlock;
350
351         err = -ENETDOWN;
352         if (!(dev->flags & IFF_UP))
353                 goto out_unlock;
354
355         /*
356          *      You may not queue a frame bigger than the mtu. This is the lowest level
357          *      raw protocol and you must do your own fragmentation at this level.
358          */
359
360         err = -EMSGSIZE;
361         if (len > dev->mtu + dev->hard_header_len)
362                 goto out_unlock;
363
364         err = -ENOBUFS;
365         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
366
367         /*
368          *      If the write buffer is full, then tough. At this level the user gets to
369          *      deal with the problem - do your own algorithmic backoffs. That's far
370          *      more flexible.
371          */
372
373         if (skb == NULL)
374                 goto out_unlock;
375
376         /*
377          *      Fill it in
378          */
379
380         /* FIXME: Save some space for broken drivers that write a
381          * hard header at transmission time by themselves. PPP is the
382          * notable one here. This should really be fixed at the driver level.
383          */
384         skb_reserve(skb, LL_RESERVED_SPACE(dev));
385         skb_reset_network_header(skb);
386
387         /* Try to align data part correctly */
388         if (dev->hard_header) {
389                 skb->data -= dev->hard_header_len;
390                 skb->tail -= dev->hard_header_len;
391                 if (len < dev->hard_header_len)
392                         skb_reset_network_header(skb);
393         }
394
395         /* Returns -EFAULT on error */
396         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
397         skb->protocol = proto;
398         skb->dev = dev;
399         skb->priority = sk->sk_priority;
400         if (err)
401                 goto out_free;
402
403         /*
404          *      Now send it
405          */
406
407         dev_queue_xmit(skb);
408         dev_put(dev);
409         return(len);
410
411 out_free:
412         kfree_skb(skb);
413 out_unlock:
414         if (dev)
415                 dev_put(dev);
416         return err;
417 }
418
419 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
420                                       unsigned int res)
421 {
422         struct sk_filter *filter;
423
424         rcu_read_lock_bh();
425         filter = rcu_dereference(sk->sk_filter);
426         if (filter != NULL)
427                 res = sk_run_filter(skb, filter->insns, filter->len);
428         rcu_read_unlock_bh();
429
430         return res;
431 }
432
433 /*
434    This function makes lazy skb cloning in hope that most of packets
435    are discarded by BPF.
436
437    Note tricky part: we DO mangle shared skb! skb->data, skb->len
438    and skb->cb are mangled. It works because (and until) packets
439    falling here are owned by current CPU. Output packets are cloned
440    by dev_queue_xmit_nit(), input packets are processed by net_bh
441    sequencially, so that if we return skb to original state on exit,
442    we will not harm anyone.
443  */
444
445 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
446 {
447         struct sock *sk;
448         struct sockaddr_ll *sll;
449         struct packet_sock *po;
450         u8 * skb_head = skb->data;
451         int skb_len = skb->len;
452         unsigned int snaplen, res;
453
454         if (skb->pkt_type == PACKET_LOOPBACK)
455                 goto drop;
456
457         sk = pt->af_packet_priv;
458         po = pkt_sk(sk);
459
460         skb->dev = dev;
461
462         if (dev->hard_header) {
463                 /* The device has an explicit notion of ll header,
464                    exported to higher levels.
465
466                    Otherwise, the device hides datails of it frame
467                    structure, so that corresponding packet head
468                    never delivered to user.
469                  */
470                 if (sk->sk_type != SOCK_DGRAM)
471                         skb_push(skb, skb->data - skb_mac_header(skb));
472                 else if (skb->pkt_type == PACKET_OUTGOING) {
473                         /* Special case: outgoing packets have ll header at head */
474                         skb_pull(skb, skb_network_offset(skb));
475                 }
476         }
477
478         snaplen = skb->len;
479
480         res = run_filter(skb, sk, snaplen);
481         if (!res)
482                 goto drop_n_restore;
483         if (snaplen > res)
484                 snaplen = res;
485
486         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
487             (unsigned)sk->sk_rcvbuf)
488                 goto drop_n_acct;
489
490         if (skb_shared(skb)) {
491                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
492                 if (nskb == NULL)
493                         goto drop_n_acct;
494
495                 if (skb_head != skb->data) {
496                         skb->data = skb_head;
497                         skb->len = skb_len;
498                 }
499                 kfree_skb(skb);
500                 skb = nskb;
501         }
502
503         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
504                      sizeof(skb->cb));
505
506         sll = &PACKET_SKB_CB(skb)->sa.ll;
507         sll->sll_family = AF_PACKET;
508         sll->sll_hatype = dev->type;
509         sll->sll_protocol = skb->protocol;
510         sll->sll_pkttype = skb->pkt_type;
511         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
512                 sll->sll_ifindex = orig_dev->ifindex;
513         else
514                 sll->sll_ifindex = dev->ifindex;
515         sll->sll_halen = 0;
516
517         if (dev->hard_header_parse)
518                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
519
520         PACKET_SKB_CB(skb)->origlen = skb->len;
521
522         if (pskb_trim(skb, snaplen))
523                 goto drop_n_acct;
524
525         skb_set_owner_r(skb, sk);
526         skb->dev = NULL;
527         dst_release(skb->dst);
528         skb->dst = NULL;
529
530         /* drop conntrack reference */
531         nf_reset(skb);
532
533         spin_lock(&sk->sk_receive_queue.lock);
534         po->stats.tp_packets++;
535         __skb_queue_tail(&sk->sk_receive_queue, skb);
536         spin_unlock(&sk->sk_receive_queue.lock);
537         sk->sk_data_ready(sk, skb->len);
538         return 0;
539
540 drop_n_acct:
541         spin_lock(&sk->sk_receive_queue.lock);
542         po->stats.tp_drops++;
543         spin_unlock(&sk->sk_receive_queue.lock);
544
545 drop_n_restore:
546         if (skb_head != skb->data && skb_shared(skb)) {
547                 skb->data = skb_head;
548                 skb->len = skb_len;
549         }
550 drop:
551         kfree_skb(skb);
552         return 0;
553 }
554
#ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook used when the socket has a mapped ring.
 *
 * Copies (at most frame_size - macoff bytes of) the packet into the ring
 * frame at po->head, fills in the tpacket header and the sockaddr_ll that
 * follows it, then publishes the frame by writing tp_status.  When the
 * packet does not fit into a frame and copy_thresh is set, a reference to
 * the whole skb is additionally queued on the receive queue.
 *
 * As in packet_rcv(), a shared skb may be mangled and is restored on exit.
 * Always returns 0.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	struct net_device *reported;
	u8 *orig_data = skb->data;	/* saved so a shared skb can be restored */
	int orig_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM) {
			skb_push(skb, skb->data - skb_mac_header(skb));
		} else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	/* A zero filter result drops the packet; otherwise it caps snaplen. */
	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	/* Work out where the mac and network headers go inside the frame. */
	if (sk->sk_type == SOCK_DGRAM) {
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
		macoff = netoff;
	} else {
		unsigned maclen = skb_network_offset(skb);

		/* Reserve at least 16 bytes in front of the network header. */
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		/* Too big for one frame: optionally keep the full packet. */
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				/* skb is shared from now on; keep it as is on exit */
				orig_data = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	/* Claim the next ring frame under the receive queue lock. */
	spin_lock(&sk->sk_receive_queue.lock);
	h = packet_lookup_frame(po, po->head);

	if (h->tp_status)
		goto ring_is_full;
	if (po->head != po->frame_max)
		po->head = po->head+1;
	else
		po->head = 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	/* Stamp the packet now if nobody has done so yet. */
	if (skb->tstamp.tv64 == 0) {
		__net_timestamp(skb);
		sock_enable_timestamp(sk);
	}
	tv = ktime_to_timeval(skb->tstamp);
	h->tp_sec = tv.tv_sec;
	h->tp_usec = tv.tv_usec;

	/* The link-level address follows the aligned tpacket header. */
	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
		reported = orig_dev;
	else
		reported = dev;
	sll->sll_ifindex = reported->ifindex;

	/* Writing the status word hands the frame over. */
	h->tp_status = status;
	smp_mb();

	{
		/* Flush every page touched, from the header to the last data byte. */
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;
		struct page *p_end = virt_to_page(h_end);
		struct page *p;

		for (p = virt_to_page(h); p <= p_end; p++)
			flush_dcache_page(p);
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (orig_data != skb->data && skb_shared(skb)) {
		skb->data = orig_data;
		skb->len = orig_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}

#endif
701
702
703 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
704                           struct msghdr *msg, size_t len)
705 {
706         struct sock *sk = sock->sk;
707         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
708         struct sk_buff *skb;
709         struct net_device *dev;
710         __be16 proto;
711         unsigned char *addr;
712         int ifindex, err, reserve = 0;
713
714         /*
715          *      Get and verify the address.
716          */
717
718         if (saddr == NULL) {
719                 struct packet_sock *po = pkt_sk(sk);
720
721                 ifindex = po->ifindex;
722                 proto   = po->num;
723                 addr    = NULL;
724         } else {
725                 err = -EINVAL;
726                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
727                         goto out;
728                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
729                         goto out;
730                 ifindex = saddr->sll_ifindex;
731                 proto   = saddr->sll_protocol;
732                 addr    = saddr->sll_addr;
733         }
734
735
736         dev = dev_get_by_index(ifindex);
737         err = -ENXIO;
738         if (dev == NULL)
739                 goto out_unlock;
740         if (sock->type == SOCK_RAW)
741                 reserve = dev->hard_header_len;
742
743         err = -ENETDOWN;
744         if (!(dev->flags & IFF_UP))
745                 goto out_unlock;
746
747         err = -EMSGSIZE;
748         if (len > dev->mtu+reserve)
749                 goto out_unlock;
750
751         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
752                                 msg->msg_flags & MSG_DONTWAIT, &err);
753         if (skb==NULL)
754                 goto out_unlock;
755
756         skb_reserve(skb, LL_RESERVED_SPACE(dev));
757         skb_reset_network_header(skb);
758
759         if (dev->hard_header) {
760                 int res;
761                 err = -EINVAL;
762                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
763                 if (sock->type != SOCK_DGRAM) {
764                         skb_reset_tail_pointer(skb);
765                         skb->len = 0;
766                 } else if (res < 0)
767                         goto out_free;
768         }
769
770         /* Returns -EFAULT on error */
771         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
772         if (err)
773                 goto out_free;
774
775         skb->protocol = proto;
776         skb->dev = dev;
777         skb->priority = sk->sk_priority;
778
779         /*
780          *      Now send it
781          */
782
783         err = dev_queue_xmit(skb);
784         if (err > 0 && (err = net_xmit_errno(err)) != 0)
785                 goto out_unlock;
786
787         dev_put(dev);
788
789         return(len);
790
791 out_free:
792         kfree_skb(skb);
793 out_unlock:
794         if (dev)
795                 dev_put(dev);
796 out:
797         return err;
798 }
799
800 /*
801  *      Close a PACKET socket. This is fairly simple. We immediately go
802  *      to 'closed' state and remove our protocol entry in the device list.
803  */
804
805 static int packet_release(struct socket *sock)
806 {
807         struct sock *sk = sock->sk;
808         struct packet_sock *po;
809
810         if (!sk)
811                 return 0;
812
813         po = pkt_sk(sk);
814
815         write_lock_bh(&packet_sklist_lock);
816         sk_del_node_init(sk);
817         write_unlock_bh(&packet_sklist_lock);
818
819         /*
820          *      Unhook packet receive handler.
821          */
822
823         if (po->running) {
824                 /*
825                  *      Remove the protocol hook
826                  */
827                 dev_remove_pack(&po->prot_hook);
828                 po->running = 0;
829                 po->num = 0;
830                 __sock_put(sk);
831         }
832
833         packet_flush_mclist(sk);
834
835 #ifdef CONFIG_PACKET_MMAP
836         if (po->pg_vec) {
837                 struct tpacket_req req;
838                 memset(&req, 0, sizeof(req));
839                 packet_set_ring(sk, &req, 1);
840         }
841 #endif
842
843         /*
844          *      Now the socket is dead. No more input will appear.
845          */
846
847         sock_orphan(sk);
848         sock->sk = NULL;
849
850         /* Purge queues */
851
852         skb_queue_purge(&sk->sk_receive_queue);
853
854         sock_put(sk);
855         return 0;
856 }
857
858 /*
859  *      Attach a packet hook.
860  */
861
862 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
863 {
864         struct packet_sock *po = pkt_sk(sk);
865         /*
866          *      Detach an existing hook if present.
867          */
868
869         lock_sock(sk);
870
871         spin_lock(&po->bind_lock);
872         if (po->running) {
873                 __sock_put(sk);
874                 po->running = 0;
875                 po->num = 0;
876                 spin_unlock(&po->bind_lock);
877                 dev_remove_pack(&po->prot_hook);
878                 spin_lock(&po->bind_lock);
879         }
880
881         po->num = protocol;
882         po->prot_hook.type = protocol;
883         po->prot_hook.dev = dev;
884
885         po->ifindex = dev ? dev->ifindex : 0;
886
887         if (protocol == 0)
888                 goto out_unlock;
889
890         if (dev) {
891                 if (dev->flags&IFF_UP) {
892                         dev_add_pack(&po->prot_hook);
893                         sock_hold(sk);
894                         po->running = 1;
895                 } else {
896                         sk->sk_err = ENETDOWN;
897                         if (!sock_flag(sk, SOCK_DEAD))
898                                 sk->sk_error_report(sk);
899                 }
900         } else {
901                 dev_add_pack(&po->prot_hook);
902                 sock_hold(sk);
903                 po->running = 1;
904         }
905
906 out_unlock:
907         spin_unlock(&po->bind_lock);
908         release_sock(sk);
909         return 0;
910 }
911
912 /*
913  *      Bind a packet socket to a device
914  */
915
916 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
917 {
918         struct sock *sk=sock->sk;
919         char name[15];
920         struct net_device *dev;
921         int err = -ENODEV;
922
923         /*
924          *      Check legality
925          */
926
927         if (addr_len != sizeof(struct sockaddr))
928                 return -EINVAL;
929         strlcpy(name,uaddr->sa_data,sizeof(name));
930
931         dev = dev_get_by_name(name);
932         if (dev) {
933                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
934                 dev_put(dev);
935         }
936         return err;
937 }
938
939 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
940 {
941         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
942         struct sock *sk=sock->sk;
943         struct net_device *dev = NULL;
944         int err;
945
946
947         /*
948          *      Check legality
949          */
950
951         if (addr_len < sizeof(struct sockaddr_ll))
952                 return -EINVAL;
953         if (sll->sll_family != AF_PACKET)
954                 return -EINVAL;
955
956         if (sll->sll_ifindex) {
957                 err = -ENODEV;
958                 dev = dev_get_by_index(sll->sll_ifindex);
959                 if (dev == NULL)
960                         goto out;
961         }
962         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
963         if (dev)
964                 dev_put(dev);
965
966 out:
967         return err;
968 }
969
970 static struct proto packet_proto = {
971         .name     = "PACKET",
972         .owner    = THIS_MODULE,
973         .obj_size = sizeof(struct packet_sock),
974 };
975
976 /*
977  *      Create a packet of type SOCK_PACKET.
978  */
979
980 static int packet_create(struct socket *sock, int protocol)
981 {
982         struct sock *sk;
983         struct packet_sock *po;
984         __be16 proto = (__force __be16)protocol; /* weird, but documented */
985         int err;
986
987         if (!capable(CAP_NET_RAW))
988                 return -EPERM;
989         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
990             sock->type != SOCK_PACKET)
991                 return -ESOCKTNOSUPPORT;
992
993         sock->state = SS_UNCONNECTED;
994
995         err = -ENOBUFS;
996         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
997         if (sk == NULL)
998                 goto out;
999
1000         sock->ops = &packet_ops;
1001         if (sock->type == SOCK_PACKET)
1002                 sock->ops = &packet_ops_spkt;
1003
1004         sock_init_data(sock, sk);
1005
1006         po = pkt_sk(sk);
1007         sk->sk_family = PF_PACKET;
1008         po->num = proto;
1009
1010         sk->sk_destruct = packet_sock_destruct;
1011         atomic_inc(&packet_socks_nr);
1012
1013         /*
1014          *      Attach a protocol block
1015          */
1016
1017         spin_lock_init(&po->bind_lock);
1018         po->prot_hook.func = packet_rcv;
1019
1020         if (sock->type == SOCK_PACKET)
1021                 po->prot_hook.func = packet_rcv_spkt;
1022
1023         po->prot_hook.af_packet_priv = sk;
1024
1025         if (proto) {
1026                 po->prot_hook.type = proto;
1027                 dev_add_pack(&po->prot_hook);
1028                 sock_hold(sk);
1029                 po->running = 1;
1030         }
1031
1032         write_lock_bh(&packet_sklist_lock);
1033         sk_add_node(sk, &packet_sklist);
1034         write_unlock_bh(&packet_sklist_lock);
1035         return(0);
1036 out:
1037         return err;
1038 }
1039
1040 /*
1041  *      Pull a packet from our receive queue and hand it to the user.
1042  *      If necessary we block.
1043  */
1044
1045 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1046                           struct msghdr *msg, size_t len, int flags)
1047 {
1048         struct sock *sk = sock->sk;
1049         struct sk_buff *skb;
1050         int copied, err;
1051         struct sockaddr_ll *sll;
1052
1053         err = -EINVAL;
1054         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1055                 goto out;
1056
1057 #if 0
1058         /* What error should we return now? EUNATTACH? */
1059         if (pkt_sk(sk)->ifindex < 0)
1060                 return -ENODEV;
1061 #endif
1062
1063         /*
1064          *      Call the generic datagram receiver. This handles all sorts
1065          *      of horrible races and re-entrancy so we can forget about it
1066          *      in the protocol layers.
1067          *
1068          *      Now it will return ENETDOWN, if device have just gone down,
1069          *      but then it will block.
1070          */
1071
1072         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1073
1074         /*
1075          *      An error occurred so return it. Because skb_recv_datagram()
1076          *      handles the blocking we don't see and worry about blocking
1077          *      retries.
1078          */
1079
1080         if (skb == NULL)
1081                 goto out;
1082
1083         /*
1084          *      If the address length field is there to be filled in, we fill
1085          *      it in now.
1086          */
1087
1088         sll = &PACKET_SKB_CB(skb)->sa.ll;
1089         if (sock->type == SOCK_PACKET)
1090                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1091         else
1092                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1093
1094         /*
1095          *      You lose any data beyond the buffer you gave. If it worries a
1096          *      user program they can ask the device for its MTU anyway.
1097          */
1098
1099         copied = skb->len;
1100         if (copied > len)
1101         {
1102                 copied=len;
1103                 msg->msg_flags|=MSG_TRUNC;
1104         }
1105
1106         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1107         if (err)
1108                 goto out_free;
1109
1110         sock_recv_timestamp(msg, sk, skb);
1111
1112         if (msg->msg_name)
1113                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1114                        msg->msg_namelen);
1115
1116         if (pkt_sk(sk)->auxdata) {
1117                 struct tpacket_auxdata aux;
1118
1119                 aux.tp_status = TP_STATUS_USER;
1120                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1121                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1122                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1123                 aux.tp_snaplen = skb->len;
1124                 aux.tp_mac = 0;
1125                 aux.tp_net = skb_network_offset(skb);
1126
1127                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1128         }
1129
1130         /*
1131          *      Free or return the buffer as appropriate. Again this
1132          *      hides all the races and re-entrancy issues from us.
1133          */
1134         err = (flags&MSG_TRUNC) ? skb->len : copied;
1135
1136 out_free:
1137         skb_free_datagram(sk, skb);
1138 out:
1139         return err;
1140 }
1141
1142 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1143                                int *uaddr_len, int peer)
1144 {
1145         struct net_device *dev;
1146         struct sock *sk = sock->sk;
1147
1148         if (peer)
1149                 return -EOPNOTSUPP;
1150
1151         uaddr->sa_family = AF_PACKET;
1152         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1153         if (dev) {
1154                 strlcpy(uaddr->sa_data, dev->name, 15);
1155                 dev_put(dev);
1156         } else
1157                 memset(uaddr->sa_data, 0, 14);
1158         *uaddr_len = sizeof(*uaddr);
1159
1160         return 0;
1161 }
1162
1163 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1164                           int *uaddr_len, int peer)
1165 {
1166         struct net_device *dev;
1167         struct sock *sk = sock->sk;
1168         struct packet_sock *po = pkt_sk(sk);
1169         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1170
1171         if (peer)
1172                 return -EOPNOTSUPP;
1173
1174         sll->sll_family = AF_PACKET;
1175         sll->sll_ifindex = po->ifindex;
1176         sll->sll_protocol = po->num;
1177         dev = dev_get_by_index(po->ifindex);
1178         if (dev) {
1179                 sll->sll_hatype = dev->type;
1180                 sll->sll_halen = dev->addr_len;
1181                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1182                 dev_put(dev);
1183         } else {
1184                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1185                 sll->sll_halen = 0;
1186         }
1187         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1188
1189         return 0;
1190 }
1191
1192 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1193 {
1194         switch (i->type) {
1195         case PACKET_MR_MULTICAST:
1196                 if (what > 0)
1197                         dev_mc_add(dev, i->addr, i->alen, 0);
1198                 else
1199                         dev_mc_delete(dev, i->addr, i->alen, 0);
1200                 break;
1201         case PACKET_MR_PROMISC:
1202                 dev_set_promiscuity(dev, what);
1203                 break;
1204         case PACKET_MR_ALLMULTI:
1205                 dev_set_allmulti(dev, what);
1206                 break;
1207         default:;
1208         }
1209 }
1210
1211 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1212 {
1213         for ( ; i; i=i->next) {
1214                 if (i->ifindex == dev->ifindex)
1215                         packet_dev_mc(dev, i, what);
1216         }
1217 }
1218
1219 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1220 {
1221         struct packet_sock *po = pkt_sk(sk);
1222         struct packet_mclist *ml, *i;
1223         struct net_device *dev;
1224         int err;
1225
1226         rtnl_lock();
1227
1228         err = -ENODEV;
1229         dev = __dev_get_by_index(mreq->mr_ifindex);
1230         if (!dev)
1231                 goto done;
1232
1233         err = -EINVAL;
1234         if (mreq->mr_alen > dev->addr_len)
1235                 goto done;
1236
1237         err = -ENOBUFS;
1238         i = kmalloc(sizeof(*i), GFP_KERNEL);
1239         if (i == NULL)
1240                 goto done;
1241
1242         err = 0;
1243         for (ml = po->mclist; ml; ml = ml->next) {
1244                 if (ml->ifindex == mreq->mr_ifindex &&
1245                     ml->type == mreq->mr_type &&
1246                     ml->alen == mreq->mr_alen &&
1247                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1248                         ml->count++;
1249                         /* Free the new element ... */
1250                         kfree(i);
1251                         goto done;
1252                 }
1253         }
1254
1255         i->type = mreq->mr_type;
1256         i->ifindex = mreq->mr_ifindex;
1257         i->alen = mreq->mr_alen;
1258         memcpy(i->addr, mreq->mr_address, i->alen);
1259         i->count = 1;
1260         i->next = po->mclist;
1261         po->mclist = i;
1262         packet_dev_mc(dev, i, +1);
1263
1264 done:
1265         rtnl_unlock();
1266         return err;
1267 }
1268
1269 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1270 {
1271         struct packet_mclist *ml, **mlp;
1272
1273         rtnl_lock();
1274
1275         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1276                 if (ml->ifindex == mreq->mr_ifindex &&
1277                     ml->type == mreq->mr_type &&
1278                     ml->alen == mreq->mr_alen &&
1279                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1280                         if (--ml->count == 0) {
1281                                 struct net_device *dev;
1282                                 *mlp = ml->next;
1283                                 dev = dev_get_by_index(ml->ifindex);
1284                                 if (dev) {
1285                                         packet_dev_mc(dev, ml, -1);
1286                                         dev_put(dev);
1287                                 }
1288                                 kfree(ml);
1289                         }
1290                         rtnl_unlock();
1291                         return 0;
1292                 }
1293         }
1294         rtnl_unlock();
1295         return -EADDRNOTAVAIL;
1296 }
1297
1298 static void packet_flush_mclist(struct sock *sk)
1299 {
1300         struct packet_sock *po = pkt_sk(sk);
1301         struct packet_mclist *ml;
1302
1303         if (!po->mclist)
1304                 return;
1305
1306         rtnl_lock();
1307         while ((ml = po->mclist) != NULL) {
1308                 struct net_device *dev;
1309
1310                 po->mclist = ml->next;
1311                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1312                         packet_dev_mc(dev, ml, -1);
1313                         dev_put(dev);
1314                 }
1315                 kfree(ml);
1316         }
1317         rtnl_unlock();
1318 }
1319
1320 static int
1321 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1322 {
1323         struct sock *sk = sock->sk;
1324         struct packet_sock *po = pkt_sk(sk);
1325         int ret;
1326
1327         if (level != SOL_PACKET)
1328                 return -ENOPROTOOPT;
1329
1330         switch(optname) {
1331         case PACKET_ADD_MEMBERSHIP:
1332         case PACKET_DROP_MEMBERSHIP:
1333         {
1334                 struct packet_mreq_max mreq;
1335                 int len = optlen;
1336                 memset(&mreq, 0, sizeof(mreq));
1337                 if (len < sizeof(struct packet_mreq))
1338                         return -EINVAL;
1339                 if (len > sizeof(mreq))
1340                         len = sizeof(mreq);
1341                 if (copy_from_user(&mreq,optval,len))
1342                         return -EFAULT;
1343                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1344                         return -EINVAL;
1345                 if (optname == PACKET_ADD_MEMBERSHIP)
1346                         ret = packet_mc_add(sk, &mreq);
1347                 else
1348                         ret = packet_mc_drop(sk, &mreq);
1349                 return ret;
1350         }
1351
1352 #ifdef CONFIG_PACKET_MMAP
1353         case PACKET_RX_RING:
1354         {
1355                 struct tpacket_req req;
1356
1357                 if (optlen<sizeof(req))
1358                         return -EINVAL;
1359                 if (copy_from_user(&req,optval,sizeof(req)))
1360                         return -EFAULT;
1361                 return packet_set_ring(sk, &req, 0);
1362         }
1363         case PACKET_COPY_THRESH:
1364         {
1365                 int val;
1366
1367                 if (optlen!=sizeof(val))
1368                         return -EINVAL;
1369                 if (copy_from_user(&val,optval,sizeof(val)))
1370                         return -EFAULT;
1371
1372                 pkt_sk(sk)->copy_thresh = val;
1373                 return 0;
1374         }
1375 #endif
1376         case PACKET_AUXDATA:
1377         {
1378                 int val;
1379
1380                 if (optlen < sizeof(val))
1381                         return -EINVAL;
1382                 if (copy_from_user(&val, optval, sizeof(val)))
1383                         return -EFAULT;
1384
1385                 po->auxdata = !!val;
1386                 return 0;
1387         }
1388         case PACKET_ORIGDEV:
1389         {
1390                 int val;
1391
1392                 if (optlen < sizeof(val))
1393                         return -EINVAL;
1394                 if (copy_from_user(&val, optval, sizeof(val)))
1395                         return -EFAULT;
1396
1397                 po->origdev = !!val;
1398                 return 0;
1399         }
1400         default:
1401                 return -ENOPROTOOPT;
1402         }
1403 }
1404
1405 static int packet_getsockopt(struct socket *sock, int level, int optname,
1406                              char __user *optval, int __user *optlen)
1407 {
1408         int len;
1409         int val;
1410         struct sock *sk = sock->sk;
1411         struct packet_sock *po = pkt_sk(sk);
1412         void *data;
1413         struct tpacket_stats st;
1414
1415         if (level != SOL_PACKET)
1416                 return -ENOPROTOOPT;
1417
1418         if (get_user(len, optlen))
1419                 return -EFAULT;
1420
1421         if (len < 0)
1422                 return -EINVAL;
1423
1424         switch(optname) {
1425         case PACKET_STATISTICS:
1426                 if (len > sizeof(struct tpacket_stats))
1427                         len = sizeof(struct tpacket_stats);
1428                 spin_lock_bh(&sk->sk_receive_queue.lock);
1429                 st = po->stats;
1430                 memset(&po->stats, 0, sizeof(st));
1431                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1432                 st.tp_packets += st.tp_drops;
1433
1434                 data = &st;
1435                 break;
1436         case PACKET_AUXDATA:
1437                 if (len > sizeof(int))
1438                         len = sizeof(int);
1439                 val = po->auxdata;
1440
1441                 data = &val;
1442                 break;
1443         case PACKET_ORIGDEV:
1444                 if (len > sizeof(int))
1445                         len = sizeof(int);
1446                 val = po->origdev;
1447
1448                 data = &val;
1449                 break;
1450         default:
1451                 return -ENOPROTOOPT;
1452         }
1453
1454         if (put_user(len, optlen))
1455                 return -EFAULT;
1456         if (copy_to_user(optval, data, len))
1457                 return -EFAULT;
1458         return 0;
1459 }
1460
1461
1462 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1463 {
1464         struct sock *sk;
1465         struct hlist_node *node;
1466         struct net_device *dev = data;
1467
1468         read_lock(&packet_sklist_lock);
1469         sk_for_each(sk, node, &packet_sklist) {
1470                 struct packet_sock *po = pkt_sk(sk);
1471
1472                 switch (msg) {
1473                 case NETDEV_UNREGISTER:
1474                         if (po->mclist)
1475                                 packet_dev_mclist(dev, po->mclist, -1);
1476                         /* fallthrough */
1477
1478                 case NETDEV_DOWN:
1479                         if (dev->ifindex == po->ifindex) {
1480                                 spin_lock(&po->bind_lock);
1481                                 if (po->running) {
1482                                         __dev_remove_pack(&po->prot_hook);
1483                                         __sock_put(sk);
1484                                         po->running = 0;
1485                                         sk->sk_err = ENETDOWN;
1486                                         if (!sock_flag(sk, SOCK_DEAD))
1487                                                 sk->sk_error_report(sk);
1488                                 }
1489                                 if (msg == NETDEV_UNREGISTER) {
1490                                         po->ifindex = -1;
1491                                         po->prot_hook.dev = NULL;
1492                                 }
1493                                 spin_unlock(&po->bind_lock);
1494                         }
1495                         break;
1496                 case NETDEV_UP:
1497                         spin_lock(&po->bind_lock);
1498                         if (dev->ifindex == po->ifindex && po->num &&
1499                             !po->running) {
1500                                 dev_add_pack(&po->prot_hook);
1501                                 sock_hold(sk);
1502                                 po->running = 1;
1503                         }
1504                         spin_unlock(&po->bind_lock);
1505                         break;
1506                 }
1507         }
1508         read_unlock(&packet_sklist_lock);
1509         return NOTIFY_DONE;
1510 }
1511
1512
1513 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1514                         unsigned long arg)
1515 {
1516         struct sock *sk = sock->sk;
1517
1518         switch(cmd) {
1519                 case SIOCOUTQ:
1520                 {
1521                         int amount = atomic_read(&sk->sk_wmem_alloc);
1522                         return put_user(amount, (int __user *)arg);
1523                 }
1524                 case SIOCINQ:
1525                 {
1526                         struct sk_buff *skb;
1527                         int amount = 0;
1528
1529                         spin_lock_bh(&sk->sk_receive_queue.lock);
1530                         skb = skb_peek(&sk->sk_receive_queue);
1531                         if (skb)
1532                                 amount = skb->len;
1533                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1534                         return put_user(amount, (int __user *)arg);
1535                 }
1536                 case SIOCGSTAMP:
1537                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1538                 case SIOCGSTAMPNS:
1539                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1540
1541 #ifdef CONFIG_INET
1542                 case SIOCADDRT:
1543                 case SIOCDELRT:
1544                 case SIOCDARP:
1545                 case SIOCGARP:
1546                 case SIOCSARP:
1547                 case SIOCGIFADDR:
1548                 case SIOCSIFADDR:
1549                 case SIOCGIFBRDADDR:
1550                 case SIOCSIFBRDADDR:
1551                 case SIOCGIFNETMASK:
1552                 case SIOCSIFNETMASK:
1553                 case SIOCGIFDSTADDR:
1554                 case SIOCSIFDSTADDR:
1555                 case SIOCSIFFLAGS:
1556                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1557 #endif
1558
1559                 default:
1560                         return -ENOIOCTLCMD;
1561         }
1562         return 0;
1563 }
1564
1565 #ifndef CONFIG_PACKET_MMAP
1566 #define packet_mmap sock_no_mmap
1567 #define packet_poll datagram_poll
1568 #else
1569
1570 static unsigned int packet_poll(struct file * file, struct socket *sock,
1571                                 poll_table *wait)
1572 {
1573         struct sock *sk = sock->sk;
1574         struct packet_sock *po = pkt_sk(sk);
1575         unsigned int mask = datagram_poll(file, sock, wait);
1576
1577         spin_lock_bh(&sk->sk_receive_queue.lock);
1578         if (po->pg_vec) {
1579                 unsigned last = po->head ? po->head-1 : po->frame_max;
1580                 struct tpacket_hdr *h;
1581
1582                 h = packet_lookup_frame(po, last);
1583
1584                 if (h->tp_status)
1585                         mask |= POLLIN | POLLRDNORM;
1586         }
1587         spin_unlock_bh(&sk->sk_receive_queue.lock);
1588         return mask;
1589 }
1590
1591
1592 /* Dirty? Well, I still did not learn better way to account
1593  * for user mmaps.
1594  */
1595
1596 static void packet_mm_open(struct vm_area_struct *vma)
1597 {
1598         struct file *file = vma->vm_file;
1599         struct socket * sock = file->private_data;
1600         struct sock *sk = sock->sk;
1601
1602         if (sk)
1603                 atomic_inc(&pkt_sk(sk)->mapped);
1604 }
1605
1606 static void packet_mm_close(struct vm_area_struct *vma)
1607 {
1608         struct file *file = vma->vm_file;
1609         struct socket * sock = file->private_data;
1610         struct sock *sk = sock->sk;
1611
1612         if (sk)
1613                 atomic_dec(&pkt_sk(sk)->mapped);
1614 }
1615
1616 static struct vm_operations_struct packet_mmap_ops = {
1617         .open = packet_mm_open,
1618         .close =packet_mm_close,
1619 };
1620
1621 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1622 {
1623         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1624 }
1625
/*
 * Free a block vector: every non-NULL block (2^order pages each) among
 * the first "len" slots, then the vector itself.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i = 0;

	while (i < len) {
		char *block = pg_vec[i++];

		/* Slots may be empty after a partial allocation. */
		if (likely(block))
			free_pages((unsigned long) block, order);
	}
	kfree(pg_vec);
}
1636
1637 static inline char *alloc_one_pg_vec_page(unsigned long order)
1638 {
1639         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1640                                          order);
1641 }
1642
1643 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1644 {
1645         unsigned int block_nr = req->tp_block_nr;
1646         char **pg_vec;
1647         int i;
1648
1649         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1650         if (unlikely(!pg_vec))
1651                 goto out;
1652
1653         for (i = 0; i < block_nr; i++) {
1654                 pg_vec[i] = alloc_one_pg_vec_page(order);
1655                 if (unlikely(!pg_vec[i]))
1656                         goto out_free_pgvec;
1657         }
1658
1659 out:
1660         return pg_vec;
1661
1662 out_free_pgvec:
1663         free_pg_vec(pg_vec, order, block_nr);
1664         pg_vec = NULL;
1665         goto out;
1666 }
1667
1668 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1669 {
1670         char **pg_vec = NULL;
1671         struct packet_sock *po = pkt_sk(sk);
1672         int was_running, order = 0;
1673         __be16 num;
1674         int err = 0;
1675
1676         if (req->tp_block_nr) {
1677                 int i, l;
1678
1679                 /* Sanity tests and some calculations */
1680
1681                 if (unlikely(po->pg_vec))
1682                         return -EBUSY;
1683
1684                 if (unlikely((int)req->tp_block_size <= 0))
1685                         return -EINVAL;
1686                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1687                         return -EINVAL;
1688                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1689                         return -EINVAL;
1690                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1691                         return -EINVAL;
1692
1693                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1694                 if (unlikely(po->frames_per_block <= 0))
1695                         return -EINVAL;
1696                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1697                              req->tp_frame_nr))
1698                         return -EINVAL;
1699
1700                 err = -ENOMEM;
1701                 order = get_order(req->tp_block_size);
1702                 pg_vec = alloc_pg_vec(req, order);
1703                 if (unlikely(!pg_vec))
1704                         goto out;
1705
1706                 l = 0;
1707                 for (i = 0; i < req->tp_block_nr; i++) {
1708                         char *ptr = pg_vec[i];
1709                         struct tpacket_hdr *header;
1710                         int k;
1711
1712                         for (k = 0; k < po->frames_per_block; k++) {
1713                                 header = (struct tpacket_hdr *) ptr;
1714                                 header->tp_status = TP_STATUS_KERNEL;
1715                                 ptr += req->tp_frame_size;
1716                         }
1717                 }
1718                 /* Done */
1719         } else {
1720                 if (unlikely(req->tp_frame_nr))
1721                         return -EINVAL;
1722         }
1723
1724         lock_sock(sk);
1725
1726         /* Detach socket from network */
1727         spin_lock(&po->bind_lock);
1728         was_running = po->running;
1729         num = po->num;
1730         if (was_running) {
1731                 __dev_remove_pack(&po->prot_hook);
1732                 po->num = 0;
1733                 po->running = 0;
1734                 __sock_put(sk);
1735         }
1736         spin_unlock(&po->bind_lock);
1737
1738         synchronize_net();
1739
1740         err = -EBUSY;
1741         if (closing || atomic_read(&po->mapped) == 0) {
1742                 err = 0;
1743 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1744
1745                 spin_lock_bh(&sk->sk_receive_queue.lock);
1746                 pg_vec = XC(po->pg_vec, pg_vec);
1747                 po->frame_max = (req->tp_frame_nr - 1);
1748                 po->head = 0;
1749                 po->frame_size = req->tp_frame_size;
1750                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1751
1752                 order = XC(po->pg_vec_order, order);
1753                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1754
1755                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1756                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1757                 skb_queue_purge(&sk->sk_receive_queue);
1758 #undef XC
1759                 if (atomic_read(&po->mapped))
1760                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1761         }
1762
1763         spin_lock(&po->bind_lock);
1764         if (was_running && !po->running) {
1765                 sock_hold(sk);
1766                 po->running = 1;
1767                 po->num = num;
1768                 dev_add_pack(&po->prot_hook);
1769         }
1770         spin_unlock(&po->bind_lock);
1771
1772         release_sock(sk);
1773
1774         if (pg_vec)
1775                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1776 out:
1777         return err;
1778 }
1779
1780 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1781 {
1782         struct sock *sk = sock->sk;
1783         struct packet_sock *po = pkt_sk(sk);
1784         unsigned long size;
1785         unsigned long start;
1786         int err = -EINVAL;
1787         int i;
1788
1789         if (vma->vm_pgoff)
1790                 return -EINVAL;
1791
1792         size = vma->vm_end - vma->vm_start;
1793
1794         lock_sock(sk);
1795         if (po->pg_vec == NULL)
1796                 goto out;
1797         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1798                 goto out;
1799
1800         start = vma->vm_start;
1801         for (i = 0; i < po->pg_vec_len; i++) {
1802                 struct page *page = virt_to_page(po->pg_vec[i]);
1803                 int pg_num;
1804
1805                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1806                         err = vm_insert_page(vma, start, page);
1807                         if (unlikely(err))
1808                                 goto out;
1809                         start += PAGE_SIZE;
1810                 }
1811         }
1812         atomic_inc(&po->mapped);
1813         vma->vm_ops = &packet_mmap_ops;
1814         err = 0;
1815
1816 out:
1817         release_sock(sk);
1818         return err;
1819 }
1820 #endif
1821
1822
1823 static const struct proto_ops packet_ops_spkt = {
1824         .family =       PF_PACKET,
1825         .owner =        THIS_MODULE,
1826         .release =      packet_release,
1827         .bind =         packet_bind_spkt,
1828         .connect =      sock_no_connect,
1829         .socketpair =   sock_no_socketpair,
1830         .accept =       sock_no_accept,
1831         .getname =      packet_getname_spkt,
1832         .poll =         datagram_poll,
1833         .ioctl =        packet_ioctl,
1834         .listen =       sock_no_listen,
1835         .shutdown =     sock_no_shutdown,
1836         .setsockopt =   sock_no_setsockopt,
1837         .getsockopt =   sock_no_getsockopt,
1838         .sendmsg =      packet_sendmsg_spkt,
1839         .recvmsg =      packet_recvmsg,
1840         .mmap =         sock_no_mmap,
1841         .sendpage =     sock_no_sendpage,
1842 };
1843
1844 static const struct proto_ops packet_ops = {
1845         .family =       PF_PACKET,
1846         .owner =        THIS_MODULE,
1847         .release =      packet_release,
1848         .bind =         packet_bind,
1849         .connect =      sock_no_connect,
1850         .socketpair =   sock_no_socketpair,
1851         .accept =       sock_no_accept,
1852         .getname =      packet_getname,
1853         .poll =         packet_poll,
1854         .ioctl =        packet_ioctl,
1855         .listen =       sock_no_listen,
1856         .shutdown =     sock_no_shutdown,
1857         .setsockopt =   packet_setsockopt,
1858         .getsockopt =   packet_getsockopt,
1859         .sendmsg =      packet_sendmsg,
1860         .recvmsg =      packet_recvmsg,
1861         .mmap =         packet_mmap,
1862         .sendpage =     sock_no_sendpage,
1863 };
1864
1865 static struct net_proto_family packet_family_ops = {
1866         .family =       PF_PACKET,
1867         .create =       packet_create,
1868         .owner  =       THIS_MODULE,
1869 };
1870
1871 static struct notifier_block packet_netdev_notifier = {
1872         .notifier_call =packet_notifier,
1873 };
1874
1875 #ifdef CONFIG_PROC_FS
1876 static inline struct sock *packet_seq_idx(loff_t off)
1877 {
1878         struct sock *s;
1879         struct hlist_node *node;
1880
1881         sk_for_each(s, node, &packet_sklist) {
1882                 if (!off--)
1883                         return s;
1884         }
1885         return NULL;
1886 }
1887
1888 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1889 {
1890         read_lock(&packet_sklist_lock);
1891         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1892 }
1893
1894 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1895 {
1896         ++*pos;
1897         return  (v == SEQ_START_TOKEN)
1898                 ? sk_head(&packet_sklist)
1899                 : sk_next((struct sock*)v) ;
1900 }
1901
1902 static void packet_seq_stop(struct seq_file *seq, void *v)
1903 {
1904         read_unlock(&packet_sklist_lock);
1905 }
1906
1907 static int packet_seq_show(struct seq_file *seq, void *v)
1908 {
1909         if (v == SEQ_START_TOKEN)
1910                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1911         else {
1912                 struct sock *s = v;
1913                 const struct packet_sock *po = pkt_sk(s);
1914
1915                 seq_printf(seq,
1916                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1917                            s,
1918                            atomic_read(&s->sk_refcnt),
1919                            s->sk_type,
1920                            ntohs(po->num),
1921                            po->ifindex,
1922                            po->running,
1923                            atomic_read(&s->sk_rmem_alloc),
1924                            sock_i_uid(s),
1925                            sock_i_ino(s) );
1926         }
1927
1928         return 0;
1929 }
1930
1931 static struct seq_operations packet_seq_ops = {
1932         .start  = packet_seq_start,
1933         .next   = packet_seq_next,
1934         .stop   = packet_seq_stop,
1935         .show   = packet_seq_show,
1936 };
1937
1938 static int packet_seq_open(struct inode *inode, struct file *file)
1939 {
1940         return seq_open(file, &packet_seq_ops);
1941 }
1942
1943 static const struct file_operations packet_seq_fops = {
1944         .owner          = THIS_MODULE,
1945         .open           = packet_seq_open,
1946         .read           = seq_read,
1947         .llseek         = seq_lseek,
1948         .release        = seq_release,
1949 };
1950
1951 #endif
1952
1953 static void __exit packet_exit(void)
1954 {
1955         proc_net_remove("packet");
1956         unregister_netdevice_notifier(&packet_netdev_notifier);
1957         sock_unregister(PF_PACKET);
1958         proto_unregister(&packet_proto);
1959 }
1960
1961 static int __init packet_init(void)
1962 {
1963         int rc = proto_register(&packet_proto, 0);
1964
1965         if (rc != 0)
1966                 goto out;
1967
1968         sock_register(&packet_family_ops);
1969         register_netdevice_notifier(&packet_netdev_notifier);
1970         proc_net_fops_create("packet", 0, &packet_seq_fops);
1971 out:
1972         return rc;
1973 }
1974
1975 module_init(packet_init);
1976 module_exit(packet_exit);
1977 MODULE_LICENSE("GPL");
1978 MODULE_ALIAS_NETPROTO(PF_PACKET);