ipvs: No need to zero out ip_vs_stats during initialization
[linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *
43  *              This program is free software; you can redistribute it and/or
44  *              modify it under the terms of the GNU General Public License
45  *              as published by the Free Software Foundation; either version
46  *              2 of the License, or (at your option) any later version.
47  *
48  */
49
50 #include <linux/types.h>
51 #include <linux/mm.h>
52 #include <linux/capability.h>
53 #include <linux/fcntl.h>
54 #include <linux/socket.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <linux/if_packet.h>
59 #include <linux/wireless.h>
60 #include <linux/kernel.h>
61 #include <linux/kmod.h>
62 #include <net/net_namespace.h>
63 #include <net/ip.h>
64 #include <net/protocol.h>
65 #include <linux/skbuff.h>
66 #include <net/sock.h>
67 #include <linux/errno.h>
68 #include <linux/timer.h>
69 #include <asm/system.h>
70 #include <asm/uaccess.h>
71 #include <asm/ioctls.h>
72 #include <asm/page.h>
73 #include <asm/cacheflush.h>
74 #include <asm/io.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80
81 #ifdef CONFIG_INET
82 #include <net/inet_common.h>
83 #endif
84
85 /*
86    Assumptions:
87    - if device has no dev->hard_header routine, it adds and removes ll header
88      inside itself. In this case ll header is invisible outside of device,
89      but higher levels still should reserve dev->hard_header_len.
90      Some devices are clever enough to reallocate the skb when the header
91      will not fit into the reserved space (tunnel), others are silly
92      (PPP).
93    - packet socket receives packets with pulled ll header,
94      so that SOCK_RAW should push it back.
95
96 On receive:
97 -----------
98
99 Incoming, dev->hard_header!=NULL
100    mac_header -> ll header
101    data       -> data
102
103 Outgoing, dev->hard_header!=NULL
104    mac_header -> ll header
105    data       -> ll header
106
107 Incoming, dev->hard_header==NULL
108    mac_header -> UNKNOWN position. It is very likely that it points to the ll
109                  header.  PPP does this, which is wrong because it introduces
110                  asymmetry between rx and tx paths.
111    data       -> data
112
113 Outgoing, dev->hard_header==NULL
114    mac_header -> data. ll header is still not built!
115    data       -> data
116
117 Summary
118   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
119
120
121 On transmit:
122 ------------
123
124 dev->hard_header != NULL
125    mac_header -> ll header
126    data       -> ll header
127
128 dev->hard_header == NULL (ll header is added by device, we cannot control it)
129    mac_header -> data
130    data       -> data
131
132    We should set nh.raw on output to the correct position,
133    the packet classifier depends on it.
134  */
135
136 /* Private packet socket structures. */
137
138 struct packet_mclist
139 {
140         struct packet_mclist    *next;
141         int                     ifindex;
142         int                     count;
143         unsigned short          type;
144         unsigned short          alen;
145         unsigned char           addr[MAX_ADDR_LEN];
146 };
147 /* identical to struct packet_mreq except it has
148  * a longer address field.
149  */
150 struct packet_mreq_max
151 {
152         int             mr_ifindex;
153         unsigned short  mr_type;
154         unsigned short  mr_alen;
155         unsigned char   mr_address[MAX_ADDR_LEN];
156 };
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
160 #endif
161
162 static void packet_flush_mclist(struct sock *sk);
163
164 struct packet_sock {
165         /* struct sock has to be the first member of packet_sock */
166         struct sock             sk;
167         struct tpacket_stats    stats;
168 #ifdef CONFIG_PACKET_MMAP
169         char *                  *pg_vec;
170         unsigned int            head;
171         unsigned int            frames_per_block;
172         unsigned int            frame_size;
173         unsigned int            frame_max;
174         int                     copy_thresh;
175 #endif
176         struct packet_type      prot_hook;
177         spinlock_t              bind_lock;
178         unsigned int            running:1,      /* prot_hook is attached*/
179                                 auxdata:1,
180                                 origdev:1;
181         int                     ifindex;        /* bound device         */
182         __be16                  num;
183         struct packet_mclist    *mclist;
184 #ifdef CONFIG_PACKET_MMAP
185         atomic_t                mapped;
186         unsigned int            pg_vec_order;
187         unsigned int            pg_vec_pages;
188         unsigned int            pg_vec_len;
189         enum tpacket_versions   tp_version;
190         unsigned int            tp_hdrlen;
191         unsigned int            tp_reserve;
192 #endif
193 };
194
195 struct packet_skb_cb {
196         unsigned int origlen;
197         union {
198                 struct sockaddr_pkt pkt;
199                 struct sockaddr_ll ll;
200         } sa;
201 };
202
203 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
204
205 #ifdef CONFIG_PACKET_MMAP
206
207 static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
208                                  int status)
209 {
210         unsigned int pg_vec_pos, frame_offset;
211         union {
212                 struct tpacket_hdr *h1;
213                 struct tpacket2_hdr *h2;
214                 void *raw;
215         } h;
216
217         pg_vec_pos = position / po->frames_per_block;
218         frame_offset = position % po->frames_per_block;
219
220         h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
221         switch (po->tp_version) {
222         case TPACKET_V1:
223                 if (status != h.h1->tp_status ? TP_STATUS_USER :
224                                                 TP_STATUS_KERNEL)
225                         return NULL;
226                 break;
227         case TPACKET_V2:
228                 if (status != h.h2->tp_status ? TP_STATUS_USER :
229                                                 TP_STATUS_KERNEL)
230                         return NULL;
231                 break;
232         }
233         return h.raw;
234 }
235
236 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
237 {
238         union {
239                 struct tpacket_hdr *h1;
240                 struct tpacket2_hdr *h2;
241                 void *raw;
242         } h;
243
244         h.raw = frame;
245         switch (po->tp_version) {
246         case TPACKET_V1:
247                 h.h1->tp_status = status;
248                 break;
249         case TPACKET_V2:
250                 h.h2->tp_status = status;
251                 break;
252         }
253 }
254 #endif
255
/*
 * Convert a generic socket pointer to the packet socket that embeds it.
 * Valid because struct sock is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}
260
261 static void packet_sock_destruct(struct sock *sk)
262 {
263         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
264         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
265
266         if (!sock_flag(sk, SOCK_DEAD)) {
267                 printk("Attempt to release alive packet socket: %p\n", sk);
268                 return;
269         }
270
271         sk_refcnt_debug_dec(sk);
272 }
273
274
275 static const struct proto_ops packet_ops;
276
277 static const struct proto_ops packet_ops_spkt;
278
279 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
280 {
281         struct sock *sk;
282         struct sockaddr_pkt *spkt;
283
284         /*
285          *      When we registered the protocol we saved the socket in the data
286          *      field for just this event.
287          */
288
289         sk = pt->af_packet_priv;
290
291         /*
292          *      Yank back the headers [hope the device set this
293          *      right or kerboom...]
294          *
295          *      Incoming packets have ll header pulled,
296          *      push it back.
297          *
298          *      For outgoing ones skb->data == skb_mac_header(skb)
299          *      so that this procedure is noop.
300          */
301
302         if (skb->pkt_type == PACKET_LOOPBACK)
303                 goto out;
304
305         if (dev_net(dev) != sock_net(sk))
306                 goto out;
307
308         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
309                 goto oom;
310
311         /* drop any routing info */
312         dst_release(skb->dst);
313         skb->dst = NULL;
314
315         /* drop conntrack reference */
316         nf_reset(skb);
317
318         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
319
320         skb_push(skb, skb->data - skb_mac_header(skb));
321
322         /*
323          *      The SOCK_PACKET socket receives _all_ frames.
324          */
325
326         spkt->spkt_family = dev->type;
327         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
328         spkt->spkt_protocol = skb->protocol;
329
330         /*
331          *      Charge the memory to the socket. This is done specifically
332          *      to prevent sockets using all the memory up.
333          */
334
335         if (sock_queue_rcv_skb(sk,skb) == 0)
336                 return 0;
337
338 out:
339         kfree_skb(skb);
340 oom:
341         return 0;
342 }
343
344
345 /*
346  *      Output a raw packet to a device layer. This bypasses all the other
347  *      protocol layers and you must therefore supply it with a complete frame
348  */
349
350 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
351                                struct msghdr *msg, size_t len)
352 {
353         struct sock *sk = sock->sk;
354         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
355         struct sk_buff *skb;
356         struct net_device *dev;
357         __be16 proto=0;
358         int err;
359
360         /*
361          *      Get and verify the address.
362          */
363
364         if (saddr)
365         {
366                 if (msg->msg_namelen < sizeof(struct sockaddr))
367                         return(-EINVAL);
368                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
369                         proto=saddr->spkt_protocol;
370         }
371         else
372                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
373
374         /*
375          *      Find the device first to size check it
376          */
377
378         saddr->spkt_device[13] = 0;
379         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
380         err = -ENODEV;
381         if (dev == NULL)
382                 goto out_unlock;
383
384         err = -ENETDOWN;
385         if (!(dev->flags & IFF_UP))
386                 goto out_unlock;
387
388         /*
389          *      You may not queue a frame bigger than the mtu. This is the lowest level
390          *      raw protocol and you must do your own fragmentation at this level.
391          */
392
393         err = -EMSGSIZE;
394         if (len > dev->mtu + dev->hard_header_len)
395                 goto out_unlock;
396
397         err = -ENOBUFS;
398         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
399
400         /*
401          *      If the write buffer is full, then tough. At this level the user gets to
402          *      deal with the problem - do your own algorithmic backoffs. That's far
403          *      more flexible.
404          */
405
406         if (skb == NULL)
407                 goto out_unlock;
408
409         /*
410          *      Fill it in
411          */
412
413         /* FIXME: Save some space for broken drivers that write a
414          * hard header at transmission time by themselves. PPP is the
415          * notable one here. This should really be fixed at the driver level.
416          */
417         skb_reserve(skb, LL_RESERVED_SPACE(dev));
418         skb_reset_network_header(skb);
419
420         /* Try to align data part correctly */
421         if (dev->header_ops) {
422                 skb->data -= dev->hard_header_len;
423                 skb->tail -= dev->hard_header_len;
424                 if (len < dev->hard_header_len)
425                         skb_reset_network_header(skb);
426         }
427
428         /* Returns -EFAULT on error */
429         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
430         skb->protocol = proto;
431         skb->dev = dev;
432         skb->priority = sk->sk_priority;
433         if (err)
434                 goto out_free;
435
436         /*
437          *      Now send it
438          */
439
440         dev_queue_xmit(skb);
441         dev_put(dev);
442         return(len);
443
444 out_free:
445         kfree_skb(skb);
446 out_unlock:
447         if (dev)
448                 dev_put(dev);
449         return err;
450 }
451
452 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
453                                       unsigned int res)
454 {
455         struct sk_filter *filter;
456
457         rcu_read_lock_bh();
458         filter = rcu_dereference(sk->sk_filter);
459         if (filter != NULL)
460                 res = sk_run_filter(skb, filter->insns, filter->len);
461         rcu_read_unlock_bh();
462
463         return res;
464 }
465
466 /*
467    This function makes lazy skb cloning in hope that most of packets
468    are discarded by BPF.
469
470    Note tricky part: we DO mangle shared skb! skb->data, skb->len
471    and skb->cb are mangled. It works because (and until) packets
472    falling here are owned by current CPU. Output packets are cloned
473    by dev_queue_xmit_nit(), input packets are processed by net_bh
474    sequencially, so that if we return skb to original state on exit,
475    we will not harm anyone.
476  */
477
478 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
479 {
480         struct sock *sk;
481         struct sockaddr_ll *sll;
482         struct packet_sock *po;
483         u8 * skb_head = skb->data;
484         int skb_len = skb->len;
485         unsigned int snaplen, res;
486
487         if (skb->pkt_type == PACKET_LOOPBACK)
488                 goto drop;
489
490         sk = pt->af_packet_priv;
491         po = pkt_sk(sk);
492
493         if (dev_net(dev) != sock_net(sk))
494                 goto drop;
495
496         skb->dev = dev;
497
498         if (dev->header_ops) {
499                 /* The device has an explicit notion of ll header,
500                    exported to higher levels.
501
502                    Otherwise, the device hides datails of it frame
503                    structure, so that corresponding packet head
504                    never delivered to user.
505                  */
506                 if (sk->sk_type != SOCK_DGRAM)
507                         skb_push(skb, skb->data - skb_mac_header(skb));
508                 else if (skb->pkt_type == PACKET_OUTGOING) {
509                         /* Special case: outgoing packets have ll header at head */
510                         skb_pull(skb, skb_network_offset(skb));
511                 }
512         }
513
514         snaplen = skb->len;
515
516         res = run_filter(skb, sk, snaplen);
517         if (!res)
518                 goto drop_n_restore;
519         if (snaplen > res)
520                 snaplen = res;
521
522         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
523             (unsigned)sk->sk_rcvbuf)
524                 goto drop_n_acct;
525
526         if (skb_shared(skb)) {
527                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
528                 if (nskb == NULL)
529                         goto drop_n_acct;
530
531                 if (skb_head != skb->data) {
532                         skb->data = skb_head;
533                         skb->len = skb_len;
534                 }
535                 kfree_skb(skb);
536                 skb = nskb;
537         }
538
539         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
540                      sizeof(skb->cb));
541
542         sll = &PACKET_SKB_CB(skb)->sa.ll;
543         sll->sll_family = AF_PACKET;
544         sll->sll_hatype = dev->type;
545         sll->sll_protocol = skb->protocol;
546         sll->sll_pkttype = skb->pkt_type;
547         if (unlikely(po->origdev))
548                 sll->sll_ifindex = orig_dev->ifindex;
549         else
550                 sll->sll_ifindex = dev->ifindex;
551
552         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
553
554         PACKET_SKB_CB(skb)->origlen = skb->len;
555
556         if (pskb_trim(skb, snaplen))
557                 goto drop_n_acct;
558
559         skb_set_owner_r(skb, sk);
560         skb->dev = NULL;
561         dst_release(skb->dst);
562         skb->dst = NULL;
563
564         /* drop conntrack reference */
565         nf_reset(skb);
566
567         spin_lock(&sk->sk_receive_queue.lock);
568         po->stats.tp_packets++;
569         __skb_queue_tail(&sk->sk_receive_queue, skb);
570         spin_unlock(&sk->sk_receive_queue.lock);
571         sk->sk_data_ready(sk, skb->len);
572         return 0;
573
574 drop_n_acct:
575         spin_lock(&sk->sk_receive_queue.lock);
576         po->stats.tp_drops++;
577         spin_unlock(&sk->sk_receive_queue.lock);
578
579 drop_n_restore:
580         if (skb_head != skb->data && skb_shared(skb)) {
581                 skb->data = skb_head;
582                 skb->len = skb_len;
583         }
584 drop:
585         kfree_skb(skb);
586         return 0;
587 }
588
589 #ifdef CONFIG_PACKET_MMAP
590 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
591 {
592         struct sock *sk;
593         struct packet_sock *po;
594         struct sockaddr_ll *sll;
595         union {
596                 struct tpacket_hdr *h1;
597                 struct tpacket2_hdr *h2;
598                 void *raw;
599         } h;
600         u8 * skb_head = skb->data;
601         int skb_len = skb->len;
602         unsigned int snaplen, res;
603         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
604         unsigned short macoff, netoff, hdrlen;
605         struct sk_buff *copy_skb = NULL;
606         struct timeval tv;
607         struct timespec ts;
608
609         if (skb->pkt_type == PACKET_LOOPBACK)
610                 goto drop;
611
612         sk = pt->af_packet_priv;
613         po = pkt_sk(sk);
614
615         if (dev_net(dev) != sock_net(sk))
616                 goto drop;
617
618         if (dev->header_ops) {
619                 if (sk->sk_type != SOCK_DGRAM)
620                         skb_push(skb, skb->data - skb_mac_header(skb));
621                 else if (skb->pkt_type == PACKET_OUTGOING) {
622                         /* Special case: outgoing packets have ll header at head */
623                         skb_pull(skb, skb_network_offset(skb));
624                 }
625         }
626
627         if (skb->ip_summed == CHECKSUM_PARTIAL)
628                 status |= TP_STATUS_CSUMNOTREADY;
629
630         snaplen = skb->len;
631
632         res = run_filter(skb, sk, snaplen);
633         if (!res)
634                 goto drop_n_restore;
635         if (snaplen > res)
636                 snaplen = res;
637
638         if (sk->sk_type == SOCK_DGRAM) {
639                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
640                                   po->tp_reserve;
641         } else {
642                 unsigned maclen = skb_network_offset(skb);
643                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
644                                        (maclen < 16 ? 16 : maclen)) +
645                         po->tp_reserve;
646                 macoff = netoff - maclen;
647         }
648
649         if (macoff + snaplen > po->frame_size) {
650                 if (po->copy_thresh &&
651                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
652                     (unsigned)sk->sk_rcvbuf) {
653                         if (skb_shared(skb)) {
654                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
655                         } else {
656                                 copy_skb = skb_get(skb);
657                                 skb_head = skb->data;
658                         }
659                         if (copy_skb)
660                                 skb_set_owner_r(copy_skb, sk);
661                 }
662                 snaplen = po->frame_size - macoff;
663                 if ((int)snaplen < 0)
664                         snaplen = 0;
665         }
666
667         spin_lock(&sk->sk_receive_queue.lock);
668         h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
669         if (!h.raw)
670                 goto ring_is_full;
671         po->head = po->head != po->frame_max ? po->head+1 : 0;
672         po->stats.tp_packets++;
673         if (copy_skb) {
674                 status |= TP_STATUS_COPY;
675                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
676         }
677         if (!po->stats.tp_drops)
678                 status &= ~TP_STATUS_LOSING;
679         spin_unlock(&sk->sk_receive_queue.lock);
680
681         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
682
683         switch (po->tp_version) {
684         case TPACKET_V1:
685                 h.h1->tp_len = skb->len;
686                 h.h1->tp_snaplen = snaplen;
687                 h.h1->tp_mac = macoff;
688                 h.h1->tp_net = netoff;
689                 if (skb->tstamp.tv64)
690                         tv = ktime_to_timeval(skb->tstamp);
691                 else
692                         do_gettimeofday(&tv);
693                 h.h1->tp_sec = tv.tv_sec;
694                 h.h1->tp_usec = tv.tv_usec;
695                 hdrlen = sizeof(*h.h1);
696                 break;
697         case TPACKET_V2:
698                 h.h2->tp_len = skb->len;
699                 h.h2->tp_snaplen = snaplen;
700                 h.h2->tp_mac = macoff;
701                 h.h2->tp_net = netoff;
702                 if (skb->tstamp.tv64)
703                         ts = ktime_to_timespec(skb->tstamp);
704                 else
705                         getnstimeofday(&ts);
706                 h.h2->tp_sec = ts.tv_sec;
707                 h.h2->tp_nsec = ts.tv_nsec;
708                 h.h2->tp_vlan_tci = skb->vlan_tci;
709                 hdrlen = sizeof(*h.h2);
710                 break;
711         default:
712                 BUG();
713         }
714
715         sll = h.raw + TPACKET_ALIGN(hdrlen);
716         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
717         sll->sll_family = AF_PACKET;
718         sll->sll_hatype = dev->type;
719         sll->sll_protocol = skb->protocol;
720         sll->sll_pkttype = skb->pkt_type;
721         if (unlikely(po->origdev))
722                 sll->sll_ifindex = orig_dev->ifindex;
723         else
724                 sll->sll_ifindex = dev->ifindex;
725
726         __packet_set_status(po, h.raw, status);
727         smp_mb();
728
729         {
730                 struct page *p_start, *p_end;
731                 u8 *h_end = h.raw + macoff + snaplen - 1;
732
733                 p_start = virt_to_page(h.raw);
734                 p_end = virt_to_page(h_end);
735                 while (p_start <= p_end) {
736                         flush_dcache_page(p_start);
737                         p_start++;
738                 }
739         }
740
741         sk->sk_data_ready(sk, 0);
742
743 drop_n_restore:
744         if (skb_head != skb->data && skb_shared(skb)) {
745                 skb->data = skb_head;
746                 skb->len = skb_len;
747         }
748 drop:
749         kfree_skb(skb);
750         return 0;
751
752 ring_is_full:
753         po->stats.tp_drops++;
754         spin_unlock(&sk->sk_receive_queue.lock);
755
756         sk->sk_data_ready(sk, 0);
757         if (copy_skb)
758                 kfree_skb(copy_skb);
759         goto drop_n_restore;
760 }
761
762 #endif
763
764
765 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
766                           struct msghdr *msg, size_t len)
767 {
768         struct sock *sk = sock->sk;
769         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
770         struct sk_buff *skb;
771         struct net_device *dev;
772         __be16 proto;
773         unsigned char *addr;
774         int ifindex, err, reserve = 0;
775
776         /*
777          *      Get and verify the address.
778          */
779
780         if (saddr == NULL) {
781                 struct packet_sock *po = pkt_sk(sk);
782
783                 ifindex = po->ifindex;
784                 proto   = po->num;
785                 addr    = NULL;
786         } else {
787                 err = -EINVAL;
788                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
789                         goto out;
790                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
791                         goto out;
792                 ifindex = saddr->sll_ifindex;
793                 proto   = saddr->sll_protocol;
794                 addr    = saddr->sll_addr;
795         }
796
797
798         dev = dev_get_by_index(sock_net(sk), ifindex);
799         err = -ENXIO;
800         if (dev == NULL)
801                 goto out_unlock;
802         if (sock->type == SOCK_RAW)
803                 reserve = dev->hard_header_len;
804
805         err = -ENETDOWN;
806         if (!(dev->flags & IFF_UP))
807                 goto out_unlock;
808
809         err = -EMSGSIZE;
810         if (len > dev->mtu+reserve)
811                 goto out_unlock;
812
813         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
814                                 msg->msg_flags & MSG_DONTWAIT, &err);
815         if (skb==NULL)
816                 goto out_unlock;
817
818         skb_reserve(skb, LL_RESERVED_SPACE(dev));
819         skb_reset_network_header(skb);
820
821         err = -EINVAL;
822         if (sock->type == SOCK_DGRAM &&
823             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
824                 goto out_free;
825
826         /* Returns -EFAULT on error */
827         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
828         if (err)
829                 goto out_free;
830
831         skb->protocol = proto;
832         skb->dev = dev;
833         skb->priority = sk->sk_priority;
834
835         /*
836          *      Now send it
837          */
838
839         err = dev_queue_xmit(skb);
840         if (err > 0 && (err = net_xmit_errno(err)) != 0)
841                 goto out_unlock;
842
843         dev_put(dev);
844
845         return(len);
846
847 out_free:
848         kfree_skb(skb);
849 out_unlock:
850         if (dev)
851                 dev_put(dev);
852 out:
853         return err;
854 }
855
856 /*
857  *      Close a PACKET socket. This is fairly simple. We immediately go
858  *      to 'closed' state and remove our protocol entry in the device list.
859  */
860
861 static int packet_release(struct socket *sock)
862 {
863         struct sock *sk = sock->sk;
864         struct packet_sock *po;
865         struct net *net;
866
867         if (!sk)
868                 return 0;
869
870         net = sock_net(sk);
871         po = pkt_sk(sk);
872
873         write_lock_bh(&net->packet.sklist_lock);
874         sk_del_node_init(sk);
875         write_unlock_bh(&net->packet.sklist_lock);
876
877         /*
878          *      Unhook packet receive handler.
879          */
880
881         if (po->running) {
882                 /*
883                  *      Remove the protocol hook
884                  */
885                 dev_remove_pack(&po->prot_hook);
886                 po->running = 0;
887                 po->num = 0;
888                 __sock_put(sk);
889         }
890
891         packet_flush_mclist(sk);
892
893 #ifdef CONFIG_PACKET_MMAP
894         if (po->pg_vec) {
895                 struct tpacket_req req;
896                 memset(&req, 0, sizeof(req));
897                 packet_set_ring(sk, &req, 1);
898         }
899 #endif
900
901         /*
902          *      Now the socket is dead. No more input will appear.
903          */
904
905         sock_orphan(sk);
906         sock->sk = NULL;
907
908         /* Purge queues */
909
910         skb_queue_purge(&sk->sk_receive_queue);
911         sk_refcnt_debug_release(sk);
912
913         sock_put(sk);
914         return 0;
915 }
916
917 /*
918  *      Attach a packet hook.
919  */
920
921 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
922 {
923         struct packet_sock *po = pkt_sk(sk);
924         /*
925          *      Detach an existing hook if present.
926          */
927
928         lock_sock(sk);
929
930         spin_lock(&po->bind_lock);
931         if (po->running) {
932                 __sock_put(sk);
933                 po->running = 0;
934                 po->num = 0;
935                 spin_unlock(&po->bind_lock);
936                 dev_remove_pack(&po->prot_hook);
937                 spin_lock(&po->bind_lock);
938         }
939
940         po->num = protocol;
941         po->prot_hook.type = protocol;
942         po->prot_hook.dev = dev;
943
944         po->ifindex = dev ? dev->ifindex : 0;
945
946         if (protocol == 0)
947                 goto out_unlock;
948
949         if (!dev || (dev->flags & IFF_UP)) {
950                 dev_add_pack(&po->prot_hook);
951                 sock_hold(sk);
952                 po->running = 1;
953         } else {
954                 sk->sk_err = ENETDOWN;
955                 if (!sock_flag(sk, SOCK_DEAD))
956                         sk->sk_error_report(sk);
957         }
958
959 out_unlock:
960         spin_unlock(&po->bind_lock);
961         release_sock(sk);
962         return 0;
963 }
964
965 /*
966  *      Bind a packet socket to a device
967  */
968
969 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
970 {
971         struct sock *sk=sock->sk;
972         char name[15];
973         struct net_device *dev;
974         int err = -ENODEV;
975
976         /*
977          *      Check legality
978          */
979
980         if (addr_len != sizeof(struct sockaddr))
981                 return -EINVAL;
982         strlcpy(name,uaddr->sa_data,sizeof(name));
983
984         dev = dev_get_by_name(sock_net(sk), name);
985         if (dev) {
986                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
987                 dev_put(dev);
988         }
989         return err;
990 }
991
992 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
993 {
994         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
995         struct sock *sk=sock->sk;
996         struct net_device *dev = NULL;
997         int err;
998
999
1000         /*
1001          *      Check legality
1002          */
1003
1004         if (addr_len < sizeof(struct sockaddr_ll))
1005                 return -EINVAL;
1006         if (sll->sll_family != AF_PACKET)
1007                 return -EINVAL;
1008
1009         if (sll->sll_ifindex) {
1010                 err = -ENODEV;
1011                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1012                 if (dev == NULL)
1013                         goto out;
1014         }
1015         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1016         if (dev)
1017                 dev_put(dev);
1018
1019 out:
1020         return err;
1021 }
1022
1023 static struct proto packet_proto = {
1024         .name     = "PACKET",
1025         .owner    = THIS_MODULE,
1026         .obj_size = sizeof(struct packet_sock),
1027 };
1028
1029 /*
1030  *      Create a packet of type SOCK_PACKET.
1031  */
1032
1033 static int packet_create(struct net *net, struct socket *sock, int protocol)
1034 {
1035         struct sock *sk;
1036         struct packet_sock *po;
1037         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1038         int err;
1039
1040         if (!capable(CAP_NET_RAW))
1041                 return -EPERM;
1042         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1043             sock->type != SOCK_PACKET)
1044                 return -ESOCKTNOSUPPORT;
1045
1046         sock->state = SS_UNCONNECTED;
1047
1048         err = -ENOBUFS;
1049         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1050         if (sk == NULL)
1051                 goto out;
1052
1053         sock->ops = &packet_ops;
1054         if (sock->type == SOCK_PACKET)
1055                 sock->ops = &packet_ops_spkt;
1056
1057         sock_init_data(sock, sk);
1058
1059         po = pkt_sk(sk);
1060         sk->sk_family = PF_PACKET;
1061         po->num = proto;
1062
1063         sk->sk_destruct = packet_sock_destruct;
1064         sk_refcnt_debug_inc(sk);
1065
1066         /*
1067          *      Attach a protocol block
1068          */
1069
1070         spin_lock_init(&po->bind_lock);
1071         po->prot_hook.func = packet_rcv;
1072
1073         if (sock->type == SOCK_PACKET)
1074                 po->prot_hook.func = packet_rcv_spkt;
1075
1076         po->prot_hook.af_packet_priv = sk;
1077
1078         if (proto) {
1079                 po->prot_hook.type = proto;
1080                 dev_add_pack(&po->prot_hook);
1081                 sock_hold(sk);
1082                 po->running = 1;
1083         }
1084
1085         write_lock_bh(&net->packet.sklist_lock);
1086         sk_add_node(sk, &net->packet.sklist);
1087         write_unlock_bh(&net->packet.sklist_lock);
1088         return(0);
1089 out:
1090         return err;
1091 }
1092
1093 /*
1094  *      Pull a packet from our receive queue and hand it to the user.
1095  *      If necessary we block.
1096  */
1097
1098 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1099                           struct msghdr *msg, size_t len, int flags)
1100 {
1101         struct sock *sk = sock->sk;
1102         struct sk_buff *skb;
1103         int copied, err;
1104         struct sockaddr_ll *sll;
1105
1106         err = -EINVAL;
1107         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1108                 goto out;
1109
1110 #if 0
1111         /* What error should we return now? EUNATTACH? */
1112         if (pkt_sk(sk)->ifindex < 0)
1113                 return -ENODEV;
1114 #endif
1115
1116         /*
1117          *      Call the generic datagram receiver. This handles all sorts
1118          *      of horrible races and re-entrancy so we can forget about it
1119          *      in the protocol layers.
1120          *
1121          *      Now it will return ENETDOWN, if device have just gone down,
1122          *      but then it will block.
1123          */
1124
1125         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1126
1127         /*
1128          *      An error occurred so return it. Because skb_recv_datagram()
1129          *      handles the blocking we don't see and worry about blocking
1130          *      retries.
1131          */
1132
1133         if (skb == NULL)
1134                 goto out;
1135
1136         /*
1137          *      If the address length field is there to be filled in, we fill
1138          *      it in now.
1139          */
1140
1141         sll = &PACKET_SKB_CB(skb)->sa.ll;
1142         if (sock->type == SOCK_PACKET)
1143                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1144         else
1145                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1146
1147         /*
1148          *      You lose any data beyond the buffer you gave. If it worries a
1149          *      user program they can ask the device for its MTU anyway.
1150          */
1151
1152         copied = skb->len;
1153         if (copied > len)
1154         {
1155                 copied=len;
1156                 msg->msg_flags|=MSG_TRUNC;
1157         }
1158
1159         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1160         if (err)
1161                 goto out_free;
1162
1163         sock_recv_timestamp(msg, sk, skb);
1164
1165         if (msg->msg_name)
1166                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1167                        msg->msg_namelen);
1168
1169         if (pkt_sk(sk)->auxdata) {
1170                 struct tpacket_auxdata aux;
1171
1172                 aux.tp_status = TP_STATUS_USER;
1173                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1174                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1175                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1176                 aux.tp_snaplen = skb->len;
1177                 aux.tp_mac = 0;
1178                 aux.tp_net = skb_network_offset(skb);
1179                 aux.tp_vlan_tci = skb->vlan_tci;
1180
1181                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1182         }
1183
1184         /*
1185          *      Free or return the buffer as appropriate. Again this
1186          *      hides all the races and re-entrancy issues from us.
1187          */
1188         err = (flags&MSG_TRUNC) ? skb->len : copied;
1189
1190 out_free:
1191         skb_free_datagram(sk, skb);
1192 out:
1193         return err;
1194 }
1195
1196 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1197                                int *uaddr_len, int peer)
1198 {
1199         struct net_device *dev;
1200         struct sock *sk = sock->sk;
1201
1202         if (peer)
1203                 return -EOPNOTSUPP;
1204
1205         uaddr->sa_family = AF_PACKET;
1206         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1207         if (dev) {
1208                 strlcpy(uaddr->sa_data, dev->name, 15);
1209                 dev_put(dev);
1210         } else
1211                 memset(uaddr->sa_data, 0, 14);
1212         *uaddr_len = sizeof(*uaddr);
1213
1214         return 0;
1215 }
1216
1217 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1218                           int *uaddr_len, int peer)
1219 {
1220         struct net_device *dev;
1221         struct sock *sk = sock->sk;
1222         struct packet_sock *po = pkt_sk(sk);
1223         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1224
1225         if (peer)
1226                 return -EOPNOTSUPP;
1227
1228         sll->sll_family = AF_PACKET;
1229         sll->sll_ifindex = po->ifindex;
1230         sll->sll_protocol = po->num;
1231         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1232         if (dev) {
1233                 sll->sll_hatype = dev->type;
1234                 sll->sll_halen = dev->addr_len;
1235                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1236                 dev_put(dev);
1237         } else {
1238                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1239                 sll->sll_halen = 0;
1240         }
1241         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1242
1243         return 0;
1244 }
1245
1246 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1247                          int what)
1248 {
1249         switch (i->type) {
1250         case PACKET_MR_MULTICAST:
1251                 if (what > 0)
1252                         dev_mc_add(dev, i->addr, i->alen, 0);
1253                 else
1254                         dev_mc_delete(dev, i->addr, i->alen, 0);
1255                 break;
1256         case PACKET_MR_PROMISC:
1257                 return dev_set_promiscuity(dev, what);
1258                 break;
1259         case PACKET_MR_ALLMULTI:
1260                 return dev_set_allmulti(dev, what);
1261                 break;
1262         default:;
1263         }
1264         return 0;
1265 }
1266
1267 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1268 {
1269         for ( ; i; i=i->next) {
1270                 if (i->ifindex == dev->ifindex)
1271                         packet_dev_mc(dev, i, what);
1272         }
1273 }
1274
1275 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1276 {
1277         struct packet_sock *po = pkt_sk(sk);
1278         struct packet_mclist *ml, *i;
1279         struct net_device *dev;
1280         int err;
1281
1282         rtnl_lock();
1283
1284         err = -ENODEV;
1285         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1286         if (!dev)
1287                 goto done;
1288
1289         err = -EINVAL;
1290         if (mreq->mr_alen > dev->addr_len)
1291                 goto done;
1292
1293         err = -ENOBUFS;
1294         i = kmalloc(sizeof(*i), GFP_KERNEL);
1295         if (i == NULL)
1296                 goto done;
1297
1298         err = 0;
1299         for (ml = po->mclist; ml; ml = ml->next) {
1300                 if (ml->ifindex == mreq->mr_ifindex &&
1301                     ml->type == mreq->mr_type &&
1302                     ml->alen == mreq->mr_alen &&
1303                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1304                         ml->count++;
1305                         /* Free the new element ... */
1306                         kfree(i);
1307                         goto done;
1308                 }
1309         }
1310
1311         i->type = mreq->mr_type;
1312         i->ifindex = mreq->mr_ifindex;
1313         i->alen = mreq->mr_alen;
1314         memcpy(i->addr, mreq->mr_address, i->alen);
1315         i->count = 1;
1316         i->next = po->mclist;
1317         po->mclist = i;
1318         err = packet_dev_mc(dev, i, 1);
1319         if (err) {
1320                 po->mclist = i->next;
1321                 kfree(i);
1322         }
1323
1324 done:
1325         rtnl_unlock();
1326         return err;
1327 }
1328
1329 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1330 {
1331         struct packet_mclist *ml, **mlp;
1332
1333         rtnl_lock();
1334
1335         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1336                 if (ml->ifindex == mreq->mr_ifindex &&
1337                     ml->type == mreq->mr_type &&
1338                     ml->alen == mreq->mr_alen &&
1339                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1340                         if (--ml->count == 0) {
1341                                 struct net_device *dev;
1342                                 *mlp = ml->next;
1343                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1344                                 if (dev) {
1345                                         packet_dev_mc(dev, ml, -1);
1346                                         dev_put(dev);
1347                                 }
1348                                 kfree(ml);
1349                         }
1350                         rtnl_unlock();
1351                         return 0;
1352                 }
1353         }
1354         rtnl_unlock();
1355         return -EADDRNOTAVAIL;
1356 }
1357
1358 static void packet_flush_mclist(struct sock *sk)
1359 {
1360         struct packet_sock *po = pkt_sk(sk);
1361         struct packet_mclist *ml;
1362
1363         if (!po->mclist)
1364                 return;
1365
1366         rtnl_lock();
1367         while ((ml = po->mclist) != NULL) {
1368                 struct net_device *dev;
1369
1370                 po->mclist = ml->next;
1371                 if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1372                         packet_dev_mc(dev, ml, -1);
1373                         dev_put(dev);
1374                 }
1375                 kfree(ml);
1376         }
1377         rtnl_unlock();
1378 }
1379
1380 static int
1381 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1382 {
1383         struct sock *sk = sock->sk;
1384         struct packet_sock *po = pkt_sk(sk);
1385         int ret;
1386
1387         if (level != SOL_PACKET)
1388                 return -ENOPROTOOPT;
1389
1390         switch(optname) {
1391         case PACKET_ADD_MEMBERSHIP:
1392         case PACKET_DROP_MEMBERSHIP:
1393         {
1394                 struct packet_mreq_max mreq;
1395                 int len = optlen;
1396                 memset(&mreq, 0, sizeof(mreq));
1397                 if (len < sizeof(struct packet_mreq))
1398                         return -EINVAL;
1399                 if (len > sizeof(mreq))
1400                         len = sizeof(mreq);
1401                 if (copy_from_user(&mreq,optval,len))
1402                         return -EFAULT;
1403                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1404                         return -EINVAL;
1405                 if (optname == PACKET_ADD_MEMBERSHIP)
1406                         ret = packet_mc_add(sk, &mreq);
1407                 else
1408                         ret = packet_mc_drop(sk, &mreq);
1409                 return ret;
1410         }
1411
1412 #ifdef CONFIG_PACKET_MMAP
1413         case PACKET_RX_RING:
1414         {
1415                 struct tpacket_req req;
1416
1417                 if (optlen<sizeof(req))
1418                         return -EINVAL;
1419                 if (copy_from_user(&req,optval,sizeof(req)))
1420                         return -EFAULT;
1421                 return packet_set_ring(sk, &req, 0);
1422         }
1423         case PACKET_COPY_THRESH:
1424         {
1425                 int val;
1426
1427                 if (optlen!=sizeof(val))
1428                         return -EINVAL;
1429                 if (copy_from_user(&val,optval,sizeof(val)))
1430                         return -EFAULT;
1431
1432                 pkt_sk(sk)->copy_thresh = val;
1433                 return 0;
1434         }
1435         case PACKET_VERSION:
1436         {
1437                 int val;
1438
1439                 if (optlen != sizeof(val))
1440                         return -EINVAL;
1441                 if (po->pg_vec)
1442                         return -EBUSY;
1443                 if (copy_from_user(&val, optval, sizeof(val)))
1444                         return -EFAULT;
1445                 switch (val) {
1446                 case TPACKET_V1:
1447                 case TPACKET_V2:
1448                         po->tp_version = val;
1449                         return 0;
1450                 default:
1451                         return -EINVAL;
1452                 }
1453         }
1454         case PACKET_RESERVE:
1455         {
1456                 unsigned int val;
1457
1458                 if (optlen != sizeof(val))
1459                         return -EINVAL;
1460                 if (po->pg_vec)
1461                         return -EBUSY;
1462                 if (copy_from_user(&val, optval, sizeof(val)))
1463                         return -EFAULT;
1464                 po->tp_reserve = val;
1465                 return 0;
1466         }
1467 #endif
1468         case PACKET_AUXDATA:
1469         {
1470                 int val;
1471
1472                 if (optlen < sizeof(val))
1473                         return -EINVAL;
1474                 if (copy_from_user(&val, optval, sizeof(val)))
1475                         return -EFAULT;
1476
1477                 po->auxdata = !!val;
1478                 return 0;
1479         }
1480         case PACKET_ORIGDEV:
1481         {
1482                 int val;
1483
1484                 if (optlen < sizeof(val))
1485                         return -EINVAL;
1486                 if (copy_from_user(&val, optval, sizeof(val)))
1487                         return -EFAULT;
1488
1489                 po->origdev = !!val;
1490                 return 0;
1491         }
1492         default:
1493                 return -ENOPROTOOPT;
1494         }
1495 }
1496
1497 static int packet_getsockopt(struct socket *sock, int level, int optname,
1498                              char __user *optval, int __user *optlen)
1499 {
1500         int len;
1501         int val;
1502         struct sock *sk = sock->sk;
1503         struct packet_sock *po = pkt_sk(sk);
1504         void *data;
1505         struct tpacket_stats st;
1506
1507         if (level != SOL_PACKET)
1508                 return -ENOPROTOOPT;
1509
1510         if (get_user(len, optlen))
1511                 return -EFAULT;
1512
1513         if (len < 0)
1514                 return -EINVAL;
1515
1516         switch(optname) {
1517         case PACKET_STATISTICS:
1518                 if (len > sizeof(struct tpacket_stats))
1519                         len = sizeof(struct tpacket_stats);
1520                 spin_lock_bh(&sk->sk_receive_queue.lock);
1521                 st = po->stats;
1522                 memset(&po->stats, 0, sizeof(st));
1523                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1524                 st.tp_packets += st.tp_drops;
1525
1526                 data = &st;
1527                 break;
1528         case PACKET_AUXDATA:
1529                 if (len > sizeof(int))
1530                         len = sizeof(int);
1531                 val = po->auxdata;
1532
1533                 data = &val;
1534                 break;
1535         case PACKET_ORIGDEV:
1536                 if (len > sizeof(int))
1537                         len = sizeof(int);
1538                 val = po->origdev;
1539
1540                 data = &val;
1541                 break;
1542 #ifdef CONFIG_PACKET_MMAP
1543         case PACKET_VERSION:
1544                 if (len > sizeof(int))
1545                         len = sizeof(int);
1546                 val = po->tp_version;
1547                 data = &val;
1548                 break;
1549         case PACKET_HDRLEN:
1550                 if (len > sizeof(int))
1551                         len = sizeof(int);
1552                 if (copy_from_user(&val, optval, len))
1553                         return -EFAULT;
1554                 switch (val) {
1555                 case TPACKET_V1:
1556                         val = sizeof(struct tpacket_hdr);
1557                         break;
1558                 case TPACKET_V2:
1559                         val = sizeof(struct tpacket2_hdr);
1560                         break;
1561                 default:
1562                         return -EINVAL;
1563                 }
1564                 data = &val;
1565                 break;
1566         case PACKET_RESERVE:
1567                 if (len > sizeof(unsigned int))
1568                         len = sizeof(unsigned int);
1569                 val = po->tp_reserve;
1570                 data = &val;
1571                 break;
1572 #endif
1573         default:
1574                 return -ENOPROTOOPT;
1575         }
1576
1577         if (put_user(len, optlen))
1578                 return -EFAULT;
1579         if (copy_to_user(optval, data, len))
1580                 return -EFAULT;
1581         return 0;
1582 }
1583
1584
1585 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1586 {
1587         struct sock *sk;
1588         struct hlist_node *node;
1589         struct net_device *dev = data;
1590         struct net *net = dev_net(dev);
1591
1592         read_lock(&net->packet.sklist_lock);
1593         sk_for_each(sk, node, &net->packet.sklist) {
1594                 struct packet_sock *po = pkt_sk(sk);
1595
1596                 switch (msg) {
1597                 case NETDEV_UNREGISTER:
1598                         if (po->mclist)
1599                                 packet_dev_mclist(dev, po->mclist, -1);
1600                         /* fallthrough */
1601
1602                 case NETDEV_DOWN:
1603                         if (dev->ifindex == po->ifindex) {
1604                                 spin_lock(&po->bind_lock);
1605                                 if (po->running) {
1606                                         __dev_remove_pack(&po->prot_hook);
1607                                         __sock_put(sk);
1608                                         po->running = 0;
1609                                         sk->sk_err = ENETDOWN;
1610                                         if (!sock_flag(sk, SOCK_DEAD))
1611                                                 sk->sk_error_report(sk);
1612                                 }
1613                                 if (msg == NETDEV_UNREGISTER) {
1614                                         po->ifindex = -1;
1615                                         po->prot_hook.dev = NULL;
1616                                 }
1617                                 spin_unlock(&po->bind_lock);
1618                         }
1619                         break;
1620                 case NETDEV_UP:
1621                         spin_lock(&po->bind_lock);
1622                         if (dev->ifindex == po->ifindex && po->num &&
1623                             !po->running) {
1624                                 dev_add_pack(&po->prot_hook);
1625                                 sock_hold(sk);
1626                                 po->running = 1;
1627                         }
1628                         spin_unlock(&po->bind_lock);
1629                         break;
1630                 }
1631         }
1632         read_unlock(&net->packet.sklist_lock);
1633         return NOTIFY_DONE;
1634 }
1635
1636
1637 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1638                         unsigned long arg)
1639 {
1640         struct sock *sk = sock->sk;
1641
1642         switch(cmd) {
1643                 case SIOCOUTQ:
1644                 {
1645                         int amount = atomic_read(&sk->sk_wmem_alloc);
1646                         return put_user(amount, (int __user *)arg);
1647                 }
1648                 case SIOCINQ:
1649                 {
1650                         struct sk_buff *skb;
1651                         int amount = 0;
1652
1653                         spin_lock_bh(&sk->sk_receive_queue.lock);
1654                         skb = skb_peek(&sk->sk_receive_queue);
1655                         if (skb)
1656                                 amount = skb->len;
1657                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1658                         return put_user(amount, (int __user *)arg);
1659                 }
1660                 case SIOCGSTAMP:
1661                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1662                 case SIOCGSTAMPNS:
1663                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1664
1665 #ifdef CONFIG_INET
1666                 case SIOCADDRT:
1667                 case SIOCDELRT:
1668                 case SIOCDARP:
1669                 case SIOCGARP:
1670                 case SIOCSARP:
1671                 case SIOCGIFADDR:
1672                 case SIOCSIFADDR:
1673                 case SIOCGIFBRDADDR:
1674                 case SIOCSIFBRDADDR:
1675                 case SIOCGIFNETMASK:
1676                 case SIOCSIFNETMASK:
1677                 case SIOCGIFDSTADDR:
1678                 case SIOCSIFDSTADDR:
1679                 case SIOCSIFFLAGS:
1680                         if (!net_eq(sock_net(sk), &init_net))
1681                                 return -ENOIOCTLCMD;
1682                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1683 #endif
1684
1685                 default:
1686                         return -ENOIOCTLCMD;
1687         }
1688         return 0;
1689 }
1690
1691 #ifndef CONFIG_PACKET_MMAP
1692 #define packet_mmap sock_no_mmap
1693 #define packet_poll datagram_poll
1694 #else
1695
1696 static unsigned int packet_poll(struct file * file, struct socket *sock,
1697                                 poll_table *wait)
1698 {
1699         struct sock *sk = sock->sk;
1700         struct packet_sock *po = pkt_sk(sk);
1701         unsigned int mask = datagram_poll(file, sock, wait);
1702
1703         spin_lock_bh(&sk->sk_receive_queue.lock);
1704         if (po->pg_vec) {
1705                 unsigned last = po->head ? po->head-1 : po->frame_max;
1706
1707                 if (packet_lookup_frame(po, last, TP_STATUS_USER))
1708                         mask |= POLLIN | POLLRDNORM;
1709         }
1710         spin_unlock_bh(&sk->sk_receive_queue.lock);
1711         return mask;
1712 }
1713
1714
1715 /* Dirty? Well, I still did not learn better way to account
1716  * for user mmaps.
1717  */
1718
1719 static void packet_mm_open(struct vm_area_struct *vma)
1720 {
1721         struct file *file = vma->vm_file;
1722         struct socket * sock = file->private_data;
1723         struct sock *sk = sock->sk;
1724
1725         if (sk)
1726                 atomic_inc(&pkt_sk(sk)->mapped);
1727 }
1728
1729 static void packet_mm_close(struct vm_area_struct *vma)
1730 {
1731         struct file *file = vma->vm_file;
1732         struct socket * sock = file->private_data;
1733         struct sock *sk = sock->sk;
1734
1735         if (sk)
1736                 atomic_dec(&pkt_sk(sk)->mapped);
1737 }
1738
1739 static struct vm_operations_struct packet_mmap_ops = {
1740         .open = packet_mm_open,
1741         .close =packet_mm_close,
1742 };
1743
/*
 * Free a block vector: every non-NULL entry among the first @len is
 * released with free_pages() at the given @order, then the vector itself
 * is freed.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int idx = 0;

	while (idx < len) {
		char *block = pg_vec[idx++];

		/* Entries may be NULL if allocation stopped part-way. */
		if (likely(block))
			free_pages((unsigned long) block, order);
	}
	kfree(pg_vec);
}
1754
1755 static inline char *alloc_one_pg_vec_page(unsigned long order)
1756 {
1757         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1758                                          order);
1759 }
1760
1761 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1762 {
1763         unsigned int block_nr = req->tp_block_nr;
1764         char **pg_vec;
1765         int i;
1766
1767         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1768         if (unlikely(!pg_vec))
1769                 goto out;
1770
1771         for (i = 0; i < block_nr; i++) {
1772                 pg_vec[i] = alloc_one_pg_vec_page(order);
1773                 if (unlikely(!pg_vec[i]))
1774                         goto out_free_pgvec;
1775         }
1776
1777 out:
1778         return pg_vec;
1779
1780 out_free_pgvec:
1781         free_pg_vec(pg_vec, order, block_nr);
1782         pg_vec = NULL;
1783         goto out;
1784 }
1785
1786 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1787 {
1788         char **pg_vec = NULL;
1789         struct packet_sock *po = pkt_sk(sk);
1790         int was_running, order = 0;
1791         __be16 num;
1792         int err = 0;
1793
1794         if (req->tp_block_nr) {
1795                 int i;
1796
1797                 /* Sanity tests and some calculations */
1798
1799                 if (unlikely(po->pg_vec))
1800                         return -EBUSY;
1801
1802                 switch (po->tp_version) {
1803                 case TPACKET_V1:
1804                         po->tp_hdrlen = TPACKET_HDRLEN;
1805                         break;
1806                 case TPACKET_V2:
1807                         po->tp_hdrlen = TPACKET2_HDRLEN;
1808                         break;
1809                 }
1810
1811                 if (unlikely((int)req->tp_block_size <= 0))
1812                         return -EINVAL;
1813                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1814                         return -EINVAL;
1815                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
1816                                                   po->tp_reserve))
1817                         return -EINVAL;
1818                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1819                         return -EINVAL;
1820
1821                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1822                 if (unlikely(po->frames_per_block <= 0))
1823                         return -EINVAL;
1824                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1825                              req->tp_frame_nr))
1826                         return -EINVAL;
1827
1828                 err = -ENOMEM;
1829                 order = get_order(req->tp_block_size);
1830                 pg_vec = alloc_pg_vec(req, order);
1831                 if (unlikely(!pg_vec))
1832                         goto out;
1833
1834                 for (i = 0; i < req->tp_block_nr; i++) {
1835                         void *ptr = pg_vec[i];
1836                         int k;
1837
1838                         for (k = 0; k < po->frames_per_block; k++) {
1839                                 __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1840                                 ptr += req->tp_frame_size;
1841                         }
1842                 }
1843                 /* Done */
1844         } else {
1845                 if (unlikely(req->tp_frame_nr))
1846                         return -EINVAL;
1847         }
1848
1849         lock_sock(sk);
1850
1851         /* Detach socket from network */
1852         spin_lock(&po->bind_lock);
1853         was_running = po->running;
1854         num = po->num;
1855         if (was_running) {
1856                 __dev_remove_pack(&po->prot_hook);
1857                 po->num = 0;
1858                 po->running = 0;
1859                 __sock_put(sk);
1860         }
1861         spin_unlock(&po->bind_lock);
1862
1863         synchronize_net();
1864
1865         err = -EBUSY;
1866         if (closing || atomic_read(&po->mapped) == 0) {
1867                 err = 0;
1868 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1869
1870                 spin_lock_bh(&sk->sk_receive_queue.lock);
1871                 pg_vec = XC(po->pg_vec, pg_vec);
1872                 po->frame_max = (req->tp_frame_nr - 1);
1873                 po->head = 0;
1874                 po->frame_size = req->tp_frame_size;
1875                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1876
1877                 order = XC(po->pg_vec_order, order);
1878                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1879
1880                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1881                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1882                 skb_queue_purge(&sk->sk_receive_queue);
1883 #undef XC
1884                 if (atomic_read(&po->mapped))
1885                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1886         }
1887
1888         spin_lock(&po->bind_lock);
1889         if (was_running && !po->running) {
1890                 sock_hold(sk);
1891                 po->running = 1;
1892                 po->num = num;
1893                 dev_add_pack(&po->prot_hook);
1894         }
1895         spin_unlock(&po->bind_lock);
1896
1897         release_sock(sk);
1898
1899         if (pg_vec)
1900                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1901 out:
1902         return err;
1903 }
1904
1905 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1906 {
1907         struct sock *sk = sock->sk;
1908         struct packet_sock *po = pkt_sk(sk);
1909         unsigned long size;
1910         unsigned long start;
1911         int err = -EINVAL;
1912         int i;
1913
1914         if (vma->vm_pgoff)
1915                 return -EINVAL;
1916
1917         size = vma->vm_end - vma->vm_start;
1918
1919         lock_sock(sk);
1920         if (po->pg_vec == NULL)
1921                 goto out;
1922         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1923                 goto out;
1924
1925         start = vma->vm_start;
1926         for (i = 0; i < po->pg_vec_len; i++) {
1927                 struct page *page = virt_to_page(po->pg_vec[i]);
1928                 int pg_num;
1929
1930                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1931                         err = vm_insert_page(vma, start, page);
1932                         if (unlikely(err))
1933                                 goto out;
1934                         start += PAGE_SIZE;
1935                 }
1936         }
1937         atomic_inc(&po->mapped);
1938         vma->vm_ops = &packet_mmap_ops;
1939         err = 0;
1940
1941 out:
1942         release_sock(sk);
1943         return err;
1944 }
1945 #endif
1946
1947
1948 static const struct proto_ops packet_ops_spkt = {
1949         .family =       PF_PACKET,
1950         .owner =        THIS_MODULE,
1951         .release =      packet_release,
1952         .bind =         packet_bind_spkt,
1953         .connect =      sock_no_connect,
1954         .socketpair =   sock_no_socketpair,
1955         .accept =       sock_no_accept,
1956         .getname =      packet_getname_spkt,
1957         .poll =         datagram_poll,
1958         .ioctl =        packet_ioctl,
1959         .listen =       sock_no_listen,
1960         .shutdown =     sock_no_shutdown,
1961         .setsockopt =   sock_no_setsockopt,
1962         .getsockopt =   sock_no_getsockopt,
1963         .sendmsg =      packet_sendmsg_spkt,
1964         .recvmsg =      packet_recvmsg,
1965         .mmap =         sock_no_mmap,
1966         .sendpage =     sock_no_sendpage,
1967 };
1968
1969 static const struct proto_ops packet_ops = {
1970         .family =       PF_PACKET,
1971         .owner =        THIS_MODULE,
1972         .release =      packet_release,
1973         .bind =         packet_bind,
1974         .connect =      sock_no_connect,
1975         .socketpair =   sock_no_socketpair,
1976         .accept =       sock_no_accept,
1977         .getname =      packet_getname,
1978         .poll =         packet_poll,
1979         .ioctl =        packet_ioctl,
1980         .listen =       sock_no_listen,
1981         .shutdown =     sock_no_shutdown,
1982         .setsockopt =   packet_setsockopt,
1983         .getsockopt =   packet_getsockopt,
1984         .sendmsg =      packet_sendmsg,
1985         .recvmsg =      packet_recvmsg,
1986         .mmap =         packet_mmap,
1987         .sendpage =     sock_no_sendpage,
1988 };
1989
1990 static struct net_proto_family packet_family_ops = {
1991         .family =       PF_PACKET,
1992         .create =       packet_create,
1993         .owner  =       THIS_MODULE,
1994 };
1995
1996 static struct notifier_block packet_netdev_notifier = {
1997         .notifier_call =packet_notifier,
1998 };
1999
2000 #ifdef CONFIG_PROC_FS
2001 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2002 {
2003         struct sock *s;
2004         struct hlist_node *node;
2005
2006         sk_for_each(s, node, &net->packet.sklist) {
2007                 if (!off--)
2008                         return s;
2009         }
2010         return NULL;
2011 }
2012
2013 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2014         __acquires(seq_file_net(seq)->packet.sklist_lock)
2015 {
2016         struct net *net = seq_file_net(seq);
2017         read_lock(&net->packet.sklist_lock);
2018         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2019 }
2020
2021 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2022 {
2023         struct net *net = seq_file_net(seq);
2024         ++*pos;
2025         return  (v == SEQ_START_TOKEN)
2026                 ? sk_head(&net->packet.sklist)
2027                 : sk_next((struct sock*)v) ;
2028 }
2029
2030 static void packet_seq_stop(struct seq_file *seq, void *v)
2031         __releases(seq_file_net(seq)->packet.sklist_lock)
2032 {
2033         struct net *net = seq_file_net(seq);
2034         read_unlock(&net->packet.sklist_lock);
2035 }
2036
2037 static int packet_seq_show(struct seq_file *seq, void *v)
2038 {
2039         if (v == SEQ_START_TOKEN)
2040                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2041         else {
2042                 struct sock *s = v;
2043                 const struct packet_sock *po = pkt_sk(s);
2044
2045                 seq_printf(seq,
2046                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2047                            s,
2048                            atomic_read(&s->sk_refcnt),
2049                            s->sk_type,
2050                            ntohs(po->num),
2051                            po->ifindex,
2052                            po->running,
2053                            atomic_read(&s->sk_rmem_alloc),
2054                            sock_i_uid(s),
2055                            sock_i_ino(s) );
2056         }
2057
2058         return 0;
2059 }
2060
2061 static const struct seq_operations packet_seq_ops = {
2062         .start  = packet_seq_start,
2063         .next   = packet_seq_next,
2064         .stop   = packet_seq_stop,
2065         .show   = packet_seq_show,
2066 };
2067
2068 static int packet_seq_open(struct inode *inode, struct file *file)
2069 {
2070         return seq_open_net(inode, file, &packet_seq_ops,
2071                             sizeof(struct seq_net_private));
2072 }
2073
2074 static const struct file_operations packet_seq_fops = {
2075         .owner          = THIS_MODULE,
2076         .open           = packet_seq_open,
2077         .read           = seq_read,
2078         .llseek         = seq_lseek,
2079         .release        = seq_release_net,
2080 };
2081
2082 #endif
2083
2084 static int packet_net_init(struct net *net)
2085 {
2086         rwlock_init(&net->packet.sklist_lock);
2087         INIT_HLIST_HEAD(&net->packet.sklist);
2088
2089         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2090                 return -ENOMEM;
2091
2092         return 0;
2093 }
2094
2095 static void packet_net_exit(struct net *net)
2096 {
2097         proc_net_remove(net, "packet");
2098 }
2099
2100 static struct pernet_operations packet_net_ops = {
2101         .init = packet_net_init,
2102         .exit = packet_net_exit,
2103 };
2104
2105
2106 static void __exit packet_exit(void)
2107 {
2108         unregister_netdevice_notifier(&packet_netdev_notifier);
2109         unregister_pernet_subsys(&packet_net_ops);
2110         sock_unregister(PF_PACKET);
2111         proto_unregister(&packet_proto);
2112 }
2113
2114 static int __init packet_init(void)
2115 {
2116         int rc = proto_register(&packet_proto, 0);
2117
2118         if (rc != 0)
2119                 goto out;
2120
2121         sock_register(&packet_family_ops);
2122         register_pernet_subsys(&packet_net_ops);
2123         register_netdevice_notifier(&packet_netdev_notifier);
2124 out:
2125         return rc;
2126 }
2127
2128 module_init(packet_init);
2129 module_exit(packet_exit);
2130 MODULE_LICENSE("GPL");
2131 MODULE_ALIAS_NETPROTO(PF_PACKET);