[NETFILTER]: ctnetlink: fix reference count leak
[linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:       
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and 
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll 
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51  
52 #include <linux/types.h>
53 #include <linux/sched.h>
54 #include <linux/mm.h>
55 #include <linux/capability.h>
56 #include <linux/fcntl.h>
57 #include <linux/socket.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/if_packet.h>
62 #include <linux/wireless.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/io.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80
81 #ifdef CONFIG_INET
82 #include <net/inet_common.h>
83 #endif
84
85 #define CONFIG_SOCK_PACKET      1
86
87 /*
88    Proposed replacement for SIOC{ADD,DEL}MULTI and
89    IFF_PROMISC, IFF_ALLMULTI flags.
90
91    It is more expensive, but I believe
92    it is the really correct solution: reentrant, safe and fault tolerant.
93
94    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
95    reference count and global flag, so that real status is
96    (gflag|(count != 0)), so that we can use obsolete faulty interface
97    not harming clever users.
98  */
99 #define CONFIG_PACKET_MULTICAST 1
100
101 /*
102    Assumptions:
103    - if device has no dev->hard_header routine, it adds and removes ll header
104      inside itself. In this case ll header is invisible outside of device,
105      but higher levels still should reserve dev->hard_header_len.
106      Some devices are clever enough to reallocate the skb when the header
107      will not fit into the reserved space (tunnel); others are silly
108      (PPP).
109    - packet socket receives packets with pulled ll header,
110      so that SOCK_RAW should push it back.
111
112 On receive:
113 -----------
114
115 Incoming, dev->hard_header!=NULL
116    mac.raw -> ll header
117    data    -> data
118
119 Outgoing, dev->hard_header!=NULL
120    mac.raw -> ll header
121    data    -> ll header
122
123 Incoming, dev->hard_header==NULL
124    mac.raw -> UNKNOWN position. It is very likely that it points to the ll header.
125               PPP does this, which is wrong because it introduces asymmetry
126               between the rx and tx paths.
127    data    -> data
128
129 Outgoing, dev->hard_header==NULL
130    mac.raw -> data. ll header is still not built!
131    data    -> data
132
133 Resume
134   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
135
136
137 On transmit:
138 ------------
139
140 dev->hard_header != NULL
141    mac.raw -> ll header
142    data    -> ll header
143
144 dev->hard_header == NULL (ll header is added by device, we cannot control it)
145    mac.raw -> data
146    data -> data
147
148    We should set nh.raw on output to the correct position;
149    the packet classifier depends on it.
150  */
151
152 /* List of all packet sockets. */
153 static HLIST_HEAD(packet_sklist);
154 static DEFINE_RWLOCK(packet_sklist_lock);
155
156 static atomic_t packet_socks_nr;
157
158
159 /* Private packet socket structures. */
160
161 #ifdef CONFIG_PACKET_MULTICAST
162 struct packet_mclist
163 {
164         struct packet_mclist    *next;
165         int                     ifindex;
166         int                     count;
167         unsigned short          type;
168         unsigned short          alen;
169         unsigned char           addr[MAX_ADDR_LEN];
170 };
171 /* identical to struct packet_mreq except it has
172  * a longer address field.
173  */
174 struct packet_mreq_max
175 {
176         int             mr_ifindex;
177         unsigned short  mr_type;
178         unsigned short  mr_alen;
179         unsigned char   mr_address[MAX_ADDR_LEN];
180 };
181 #endif
182 #ifdef CONFIG_PACKET_MMAP
183 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
184 #endif
185
186 static void packet_flush_mclist(struct sock *sk);
187
188 struct packet_sock {
189         /* struct sock has to be the first member of packet_sock */
190         struct sock             sk;
191         struct tpacket_stats    stats;
192 #ifdef CONFIG_PACKET_MMAP
193         char *                  *pg_vec;
194         unsigned int            head;
195         unsigned int            frames_per_block;
196         unsigned int            frame_size;
197         unsigned int            frame_max;
198         int                     copy_thresh;
199 #endif
200         struct packet_type      prot_hook;
201         spinlock_t              bind_lock;
202         char                    running;        /* prot_hook is attached*/
203         int                     ifindex;        /* bound device         */
204         unsigned short          num;
205 #ifdef CONFIG_PACKET_MULTICAST
206         struct packet_mclist    *mclist;
207 #endif
208 #ifdef CONFIG_PACKET_MMAP
209         atomic_t                mapped;
210         unsigned int            pg_vec_order;
211         unsigned int            pg_vec_pages;
212         unsigned int            pg_vec_len;
213 #endif
214 };
215
216 #ifdef CONFIG_PACKET_MMAP
217
218 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
219 {
220         unsigned int pg_vec_pos, frame_offset;
221         char *frame;
222
223         pg_vec_pos = position / po->frames_per_block;
224         frame_offset = position % po->frames_per_block;
225
226         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
227         
228         return frame;
229 }
230 #endif
231
/*
 * Convert a generic socket pointer into the packet socket that embeds it.
 * Valid only because struct sock is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
236
237 static void packet_sock_destruct(struct sock *sk)
238 {
239         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
240         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
241
242         if (!sock_flag(sk, SOCK_DEAD)) {
243                 printk("Attempt to release alive packet socket: %p\n", sk);
244                 return;
245         }
246
247         atomic_dec(&packet_socks_nr);
248 #ifdef PACKET_REFCNT_DEBUG
249         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
250 #endif
251 }
252
253
254 static const struct proto_ops packet_ops;
255
256 #ifdef CONFIG_SOCK_PACKET
257 static const struct proto_ops packet_ops_spkt;
258
259 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
260 {
261         struct sock *sk;
262         struct sockaddr_pkt *spkt;
263
264         /*
265          *      When we registered the protocol we saved the socket in the data
266          *      field for just this event.
267          */
268
269         sk = pt->af_packet_priv;
270         
271         /*
272          *      Yank back the headers [hope the device set this
273          *      right or kerboom...]
274          *
275          *      Incoming packets have ll header pulled,
276          *      push it back.
277          *
278          *      For outgoing ones skb->data == skb->mac.raw
279          *      so that this procedure is noop.
280          */
281
282         if (skb->pkt_type == PACKET_LOOPBACK)
283                 goto out;
284
285         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
286                 goto oom;
287
288         /* drop any routing info */
289         dst_release(skb->dst);
290         skb->dst = NULL;
291
292         /* drop conntrack reference */
293         nf_reset(skb);
294
295         spkt = (struct sockaddr_pkt*)skb->cb;
296
297         skb_push(skb, skb->data-skb->mac.raw);
298
299         /*
300          *      The SOCK_PACKET socket receives _all_ frames.
301          */
302
303         spkt->spkt_family = dev->type;
304         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
305         spkt->spkt_protocol = skb->protocol;
306
307         /*
308          *      Charge the memory to the socket. This is done specifically
309          *      to prevent sockets using all the memory up.
310          */
311
312         if (sock_queue_rcv_skb(sk,skb) == 0)
313                 return 0;
314
315 out:
316         kfree_skb(skb);
317 oom:
318         return 0;
319 }
320
321
322 /*
323  *      Output a raw packet to a device layer. This bypasses all the other
324  *      protocol layers and you must therefore supply it with a complete frame
325  */
326  
327 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
328                                struct msghdr *msg, size_t len)
329 {
330         struct sock *sk = sock->sk;
331         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
332         struct sk_buff *skb;
333         struct net_device *dev;
334         unsigned short proto=0;
335         int err;
336         
337         /*
338          *      Get and verify the address. 
339          */
340
341         if (saddr)
342         {
343                 if (msg->msg_namelen < sizeof(struct sockaddr))
344                         return(-EINVAL);
345                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
346                         proto=saddr->spkt_protocol;
347         }
348         else
349                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
350
351         /*
352          *      Find the device first to size check it 
353          */
354
355         saddr->spkt_device[13] = 0;
356         dev = dev_get_by_name(saddr->spkt_device);
357         err = -ENODEV;
358         if (dev == NULL)
359                 goto out_unlock;
360         
361         /*
362          *      You may not queue a frame bigger than the mtu. This is the lowest level
363          *      raw protocol and you must do your own fragmentation at this level.
364          */
365          
366         err = -EMSGSIZE;
367         if (len > dev->mtu + dev->hard_header_len)
368                 goto out_unlock;
369
370         err = -ENOBUFS;
371         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
372
373         /*
374          *      If the write buffer is full, then tough. At this level the user gets to
375          *      deal with the problem - do your own algorithmic backoffs. That's far
376          *      more flexible.
377          */
378          
379         if (skb == NULL) 
380                 goto out_unlock;
381
382         /*
383          *      Fill it in 
384          */
385          
386         /* FIXME: Save some space for broken drivers that write a
387          * hard header at transmission time by themselves. PPP is the
388          * notable one here. This should really be fixed at the driver level.
389          */
390         skb_reserve(skb, LL_RESERVED_SPACE(dev));
391         skb->nh.raw = skb->data;
392
393         /* Try to align data part correctly */
394         if (dev->hard_header) {
395                 skb->data -= dev->hard_header_len;
396                 skb->tail -= dev->hard_header_len;
397                 if (len < dev->hard_header_len)
398                         skb->nh.raw = skb->data;
399         }
400
401         /* Returns -EFAULT on error */
402         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
403         skb->protocol = proto;
404         skb->dev = dev;
405         skb->priority = sk->sk_priority;
406         if (err)
407                 goto out_free;
408
409         err = -ENETDOWN;
410         if (!(dev->flags & IFF_UP))
411                 goto out_free;
412
413         /*
414          *      Now send it
415          */
416
417         dev_queue_xmit(skb);
418         dev_put(dev);
419         return(len);
420
421 out_free:
422         kfree_skb(skb);
423 out_unlock:
424         if (dev)
425                 dev_put(dev);
426         return err;
427 }
428 #endif
429
430 static inline int run_filter(struct sk_buff *skb, struct sock *sk,
431                                                         unsigned *snaplen)
432 {
433         struct sk_filter *filter;
434         int err = 0;
435
436         rcu_read_lock_bh();
437         filter = rcu_dereference(sk->sk_filter);
438         if (filter != NULL) {
439                 err = sk_run_filter(skb, filter->insns, filter->len);
440                 if (!err)
441                         err = -EPERM;
442                 else if (*snaplen > err)
443                         *snaplen = err;
444         }
445         rcu_read_unlock_bh();
446
447         return err;
448 }
449
450 /*
451    This function makes lazy skb cloning in hope that most of packets
452    are discarded by BPF.
453
454    Note tricky part: we DO mangle shared skb! skb->data, skb->len
455    and skb->cb are mangled. It works because (and until) packets
456    falling here are owned by current CPU. Output packets are cloned
457    by dev_queue_xmit_nit(), input packets are processed by net_bh
458    sequentially, so that if we return skb to original state on exit,
459    we will not harm anyone.
460  */
461
/*
 * packet_rcv - receive hook for packet sockets that queue whole skbs.
 *
 * skb:      frame handed to the hook; one reference is consumed on every path
 * dev:      device the frame is reported against
 * pt:       the hook itself; pt->af_packet_priv holds the owning socket
 * orig_dev: not used in this function
 *
 * Adjusts the visible start of the frame for the socket type, runs the
 * socket filter, and if there is receive buffer room queues the frame
 * (a clone when the skb is shared) with a struct sockaddr_ll in skb->cb.
 * Frames rejected by the filter are dropped silently; frames dropped for
 * lack of room, failed clone or failed trim are counted in tp_drops.
 * Always returns 0.
 */
462 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
463 {
464         struct sock *sk;
465         struct sockaddr_ll *sll;
466         struct packet_sock *po;
/* Remember the original data/len so a shared skb can be restored on exit. */
467         u8 * skb_head = skb->data;
468         int skb_len = skb->len;
469         unsigned snaplen;
470
471         if (skb->pkt_type == PACKET_LOOPBACK)
472                 goto drop;
473
474         sk = pt->af_packet_priv;
475         po = pkt_sk(sk);
476
477         skb->dev = dev;
478
479         if (dev->hard_header) {
480                 /* The device has an explicit notion of ll header,
481                    exported to higher levels.
482
483                    Otherwise, the device hides details of its frame
484                    structure, so that the corresponding packet head
485                    is never delivered to the user.
486                  */
487                 if (sk->sk_type != SOCK_DGRAM)
488                         skb_push(skb, skb->data - skb->mac.raw);
489                 else if (skb->pkt_type == PACKET_OUTGOING) {
490                         /* Special case: outgoing packets have ll header at head */
491                         skb_pull(skb, skb->nh.raw - skb->data);
492                 }
493         }
494
495         snaplen = skb->len;
496
/* A negative result means the filter rejected the frame; not counted as a drop. */
497         if (run_filter(skb, sk, &snaplen) < 0)
498                 goto drop_n_restore;
499
500         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
501             (unsigned)sk->sk_rcvbuf)
502                 goto drop_n_acct;
503
/*
 * The filter accepted the frame, so only now pay for a clone.  The
 * original is put back into its entry state before our reference to it
 * is released, and processing continues on the private clone.
 */
504         if (skb_shared(skb)) {
505                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
506                 if (nskb == NULL)
507                         goto drop_n_acct;
508
509                 if (skb_head != skb->data) {
510                         skb->data = skb_head;
511                         skb->len = skb_len;
512                 }
513                 kfree_skb(skb);
514                 skb = nskb;
515         }
516
/* The source address handed to the reader is built in the skb control block. */
517         sll = (struct sockaddr_ll*)skb->cb;
518         sll->sll_family = AF_PACKET;
519         sll->sll_hatype = dev->type;
520         sll->sll_protocol = skb->protocol;
521         sll->sll_pkttype = skb->pkt_type;
522         sll->sll_ifindex = dev->ifindex;
523         sll->sll_halen = 0;
524
525         if (dev->hard_header_parse)
526                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
527
/* Cut the frame down to the length chosen by the filter. */
528         if (pskb_trim(skb, snaplen))
529                 goto drop_n_acct;
530
531         skb_set_owner_r(skb, sk);
532         skb->dev = NULL;
533         dst_release(skb->dst);
534         skb->dst = NULL;
535
536         /* drop conntrack reference */
537         nf_reset(skb);
538
/* tp_packets is updated under the receive queue lock, together with the enqueue. */
539         spin_lock(&sk->sk_receive_queue.lock);
540         po->stats.tp_packets++;
541         __skb_queue_tail(&sk->sk_receive_queue, skb);
542         spin_unlock(&sk->sk_receive_queue.lock);
543         sk->sk_data_ready(sk, skb->len);
544         return 0;
545
546 drop_n_acct:
547         spin_lock(&sk->sk_receive_queue.lock);
548         po->stats.tp_drops++;
549         spin_unlock(&sk->sk_receive_queue.lock);
550
551 drop_n_restore:
/* Undo the push/pull above if other users of the skb can still see it. */
552         if (skb_head != skb->data && skb_shared(skb)) {
553                 skb->data = skb_head;
554                 skb->len = skb_len;
555         }
556 drop:
557         kfree_skb(skb);
558         return 0;
559 }
560
561 #ifdef CONFIG_PACKET_MMAP
/*
 * tpacket_rcv - receive hook for packet sockets that use the frame ring.
 *
 * skb:      frame handed to the hook; one reference is consumed on every path
 * dev:      device the frame is reported against
 * pt:       the hook itself; pt->af_packet_priv holds the owning socket
 * orig_dev: not used in this function
 *
 * After filtering, the frame is copied into the ring slot at po->head,
 * preceded by a struct tpacket_hdr and a struct sockaddr_ll, and the slot
 * is published by writing tp_status.  A frame that does not fit in a slot
 * is truncated; when po->copy_thresh is set and receive buffer room
 * remains, the full frame is additionally queued on sk_receive_queue.
 * If the slot at po->head still has a non-zero tp_status the ring is
 * treated as full and the frame is counted in tp_drops.
 * Always returns 0.
 */
562 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
563 {
564         struct sock *sk;
565         struct packet_sock *po;
566         struct sockaddr_ll *sll;
567         struct tpacket_hdr *h;
/* Remember the original data/len so a shared skb can be restored on exit. */
568         u8 * skb_head = skb->data;
569         int skb_len = skb->len;
570         unsigned snaplen;
/* Start out "losing"; the flag is cleared below when no drops were recorded. */
571         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
572         unsigned short macoff, netoff;
573         struct sk_buff *copy_skb = NULL;
574
575         if (skb->pkt_type == PACKET_LOOPBACK)
576                 goto drop;
577
578         sk = pt->af_packet_priv;
579         po = pkt_sk(sk);
580
581         if (dev->hard_header) {
582                 if (sk->sk_type != SOCK_DGRAM)
583                         skb_push(skb, skb->data - skb->mac.raw);
584                 else if (skb->pkt_type == PACKET_OUTGOING) {
585                         /* Special case: outgoing packets have ll header at head */
586                         skb_pull(skb, skb->nh.raw - skb->data);
587                         if (skb->ip_summed == CHECKSUM_PARTIAL)
588                                 status |= TP_STATUS_CSUMNOTREADY;
589                 }
590         }
591
592         snaplen = skb->len;
593
/* A negative result means the filter rejected the frame; not counted as a drop. */
594         if (run_filter(skb, sk, &snaplen) < 0)
595                 goto drop_n_restore;
596
/*
 * Work out where in the slot the mac and network headers go.  At least
 * 16 bytes are left after the tpacket header before the network header.
 */
597         if (sk->sk_type == SOCK_DGRAM) {
598                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
599         } else {
600                 unsigned maclen = skb->nh.raw - skb->data;
601                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
602                 macoff = netoff - maclen;
603         }
604
/* Frame does not fit in one slot: optionally keep a full copy, then truncate. */
605         if (macoff + snaplen > po->frame_size) {
606                 if (po->copy_thresh &&
607                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
608                     (unsigned)sk->sk_rcvbuf) {
609                         if (skb_shared(skb)) {
610                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
611                         } else {
/*
 * The queued copy is this very skb, so it must keep the adjusted data
 * pointer.  Resetting skb_head makes the restore at drop_n_restore a
 * no-op for it.
 */
612                                 copy_skb = skb_get(skb);
613                                 skb_head = skb->data;
614                         }
615                         if (copy_skb)
616                                 skb_set_owner_r(copy_skb, sk);
617                 }
618                 snaplen = po->frame_size - macoff;
619                 if ((int)snaplen < 0)
620                         snaplen = 0;
621         }
622
/* Slot selection, head advance and statistics happen under the queue lock. */
623         spin_lock(&sk->sk_receive_queue.lock);
624         h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
625         
/* A non-zero status means this slot has not been handed back yet. */
626         if (h->tp_status)
627                 goto ring_is_full;
628         po->head = po->head != po->frame_max ? po->head+1 : 0;
629         po->stats.tp_packets++;
630         if (copy_skb) {
631                 status |= TP_STATUS_COPY;
632                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
633         }
634         if (!po->stats.tp_drops)
635                 status &= ~TP_STATUS_LOSING;
636         spin_unlock(&sk->sk_receive_queue.lock);
637
/* The slot is ours until tp_status is written, so fill it outside the lock. */
638         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
639
640         h->tp_len = skb->len;
641         h->tp_snaplen = snaplen;
642         h->tp_mac = macoff;
643         h->tp_net = netoff;
/* No timestamp on the skb yet: stamp it now and enable timestamping on the socket. */
644         if (skb->tstamp.off_sec == 0) { 
645                 __net_timestamp(skb);
646                 sock_enable_timestamp(sk);
647         }
648         h->tp_sec = skb->tstamp.off_sec;
649         h->tp_usec = skb->tstamp.off_usec;
650
/* The link-level address is stored right after the aligned tpacket header. */
651         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
652         sll->sll_halen = 0;
653         if (dev->hard_header_parse)
654                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
655         sll->sll_family = AF_PACKET;
656         sll->sll_hatype = dev->type;
657         sll->sll_protocol = skb->protocol;
658         sll->sll_pkttype = skb->pkt_type;
659         sll->sll_ifindex = dev->ifindex;
660
/* Writing the status publishes the slot; it is done after all other fields. */
661         h->tp_status = status;
662         mb();
663
/* Flush every page touched by this frame, from the header to the last data byte. */
664         {
665                 struct page *p_start, *p_end;
666                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
667
668                 p_start = virt_to_page(h);
669                 p_end = virt_to_page(h_end);
670                 while (p_start <= p_end) {
671                         flush_dcache_page(p_start);
672                         p_start++;
673                 }
674         }
675
676         sk->sk_data_ready(sk, 0);
677
/* The success path falls through: the data was copied, so our skb reference goes. */
678 drop_n_restore:
679         if (skb_head != skb->data && skb_shared(skb)) {
680                 skb->data = skb_head;
681                 skb->len = skb_len;
682         }
683 drop:
684         kfree_skb(skb);
685         return 0;
686
/* Entered with the receive queue lock held. */
687 ring_is_full:
688         po->stats.tp_drops++;
689         spin_unlock(&sk->sk_receive_queue.lock);
690
691         sk->sk_data_ready(sk, 0);
692         if (copy_skb)
693                 kfree_skb(copy_skb);
694         goto drop_n_restore;
695 }
696
697 #endif
698
699
700 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
701                           struct msghdr *msg, size_t len)
702 {
703         struct sock *sk = sock->sk;
704         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
705         struct sk_buff *skb;
706         struct net_device *dev;
707         unsigned short proto;
708         unsigned char *addr;
709         int ifindex, err, reserve = 0;
710
711         /*
712          *      Get and verify the address. 
713          */
714          
715         if (saddr == NULL) {
716                 struct packet_sock *po = pkt_sk(sk);
717
718                 ifindex = po->ifindex;
719                 proto   = po->num;
720                 addr    = NULL;
721         } else {
722                 err = -EINVAL;
723                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
724                         goto out;
725                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
726                         goto out;
727                 ifindex = saddr->sll_ifindex;
728                 proto   = saddr->sll_protocol;
729                 addr    = saddr->sll_addr;
730         }
731
732
733         dev = dev_get_by_index(ifindex);
734         err = -ENXIO;
735         if (dev == NULL)
736                 goto out_unlock;
737         if (sock->type == SOCK_RAW)
738                 reserve = dev->hard_header_len;
739
740         err = -EMSGSIZE;
741         if (len > dev->mtu+reserve)
742                 goto out_unlock;
743
744         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
745                                 msg->msg_flags & MSG_DONTWAIT, &err);
746         if (skb==NULL)
747                 goto out_unlock;
748
749         skb_reserve(skb, LL_RESERVED_SPACE(dev));
750         skb->nh.raw = skb->data;
751
752         if (dev->hard_header) {
753                 int res;
754                 err = -EINVAL;
755                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
756                 if (sock->type != SOCK_DGRAM) {
757                         skb->tail = skb->data;
758                         skb->len = 0;
759                 } else if (res < 0)
760                         goto out_free;
761         }
762
763         /* Returns -EFAULT on error */
764         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
765         if (err)
766                 goto out_free;
767
768         skb->protocol = proto;
769         skb->dev = dev;
770         skb->priority = sk->sk_priority;
771
772         err = -ENETDOWN;
773         if (!(dev->flags & IFF_UP))
774                 goto out_free;
775
776         /*
777          *      Now send it
778          */
779
780         err = dev_queue_xmit(skb);
781         if (err > 0 && (err = net_xmit_errno(err)) != 0)
782                 goto out_unlock;
783
784         dev_put(dev);
785
786         return(len);
787
788 out_free:
789         kfree_skb(skb);
790 out_unlock:
791         if (dev)
792                 dev_put(dev);
793 out:
794         return err;
795 }
796
797 /*
798  *      Close a PACKET socket. This is fairly simple. We immediately go
799  *      to 'closed' state and remove our protocol entry in the device list.
800  */
801
/*
 * packet_release - release() handler: tear down a packet socket.
 *
 * Order of teardown: unlink from the global socket list, detach the
 * receive hook (dropping the reference taken when it was attached), flush
 * the membership list, free the frame ring, orphan the socket, purge
 * queued frames and finally drop the caller's reference.
 * Returns 0 always, including when sock->sk is already NULL.
 */
802 static int packet_release(struct socket *sock)
803 {
804         struct sock *sk = sock->sk;
805         struct packet_sock *po;
806
807         if (!sk)
808                 return 0;
809
810         po = pkt_sk(sk);
811
812         write_lock_bh(&packet_sklist_lock);
813         sk_del_node_init(sk);
814         write_unlock_bh(&packet_sklist_lock);
815
816         /*
817          *      Unhook packet receive handler.
818          */
819
820         if (po->running) {
821                 /*
822                  *      Remove the protocol hook
823                  */
824                 dev_remove_pack(&po->prot_hook);
825                 po->running = 0;
826                 po->num = 0;
/* Pairs with the sock_hold() done in packet_do_bind() when the hook was attached. */
827                 __sock_put(sk);
828         }
829
830 #ifdef CONFIG_PACKET_MULTICAST
831         packet_flush_mclist(sk);
832 #endif
833
834 #ifdef CONFIG_PACKET_MMAP
/* An all-zero request with closing=1 asks packet_set_ring() to drop the ring. */
835         if (po->pg_vec) {
836                 struct tpacket_req req;
837                 memset(&req, 0, sizeof(req));
838                 packet_set_ring(sk, &req, 1);
839         }
840 #endif
841
842         /*
843          *      Now the socket is dead. No more input will appear.
844          */
845
846         sock_orphan(sk);
847         sock->sk = NULL;
848
849         /* Purge queues */
850
851         skb_queue_purge(&sk->sk_receive_queue);
852
853         sock_put(sk);
854         return 0;
855 }
856
857 /*
858  *      Attach a packet hook.
859  */
860
861 static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
862 {
863         struct packet_sock *po = pkt_sk(sk);
864         /*
865          *      Detach an existing hook if present.
866          */
867
868         lock_sock(sk);
869
870         spin_lock(&po->bind_lock);
871         if (po->running) {
872                 __sock_put(sk);
873                 po->running = 0;
874                 po->num = 0;
875                 spin_unlock(&po->bind_lock);
876                 dev_remove_pack(&po->prot_hook);
877                 spin_lock(&po->bind_lock);
878         }
879
880         po->num = protocol;
881         po->prot_hook.type = protocol;
882         po->prot_hook.dev = dev;
883
884         po->ifindex = dev ? dev->ifindex : 0;
885
886         if (protocol == 0)
887                 goto out_unlock;
888
889         if (dev) {
890                 if (dev->flags&IFF_UP) {
891                         dev_add_pack(&po->prot_hook);
892                         sock_hold(sk);
893                         po->running = 1;
894                 } else {
895                         sk->sk_err = ENETDOWN;
896                         if (!sock_flag(sk, SOCK_DEAD))
897                                 sk->sk_error_report(sk);
898                 }
899         } else {
900                 dev_add_pack(&po->prot_hook);
901                 sock_hold(sk);
902                 po->running = 1;
903         }
904
905 out_unlock:
906         spin_unlock(&po->bind_lock);
907         release_sock(sk);
908         return 0;
909 }
910
911 /*
912  *      Bind a packet socket to a device
913  */
914
915 #ifdef CONFIG_SOCK_PACKET
916
917 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
918 {
919         struct sock *sk=sock->sk;
920         char name[15];
921         struct net_device *dev;
922         int err = -ENODEV;
923         
924         /*
925          *      Check legality
926          */
927          
928         if (addr_len != sizeof(struct sockaddr))
929                 return -EINVAL;
930         strlcpy(name,uaddr->sa_data,sizeof(name));
931
932         dev = dev_get_by_name(name);
933         if (dev) {
934                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
935                 dev_put(dev);
936         }
937         return err;
938 }
939 #endif
940
941 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
942 {
943         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
944         struct sock *sk=sock->sk;
945         struct net_device *dev = NULL;
946         int err;
947
948
949         /*
950          *      Check legality
951          */
952          
953         if (addr_len < sizeof(struct sockaddr_ll))
954                 return -EINVAL;
955         if (sll->sll_family != AF_PACKET)
956                 return -EINVAL;
957
958         if (sll->sll_ifindex) {
959                 err = -ENODEV;
960                 dev = dev_get_by_index(sll->sll_ifindex);
961                 if (dev == NULL)
962                         goto out;
963         }
964         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
965         if (dev)
966                 dev_put(dev);
967
968 out:
969         return err;
970 }
971
972 static struct proto packet_proto = {
973         .name     = "PACKET",
974         .owner    = THIS_MODULE,
975         .obj_size = sizeof(struct packet_sock),
976 };
977
978 /*
979  *      Create a packet of type SOCK_PACKET. 
980  */
981
982 static int packet_create(struct socket *sock, int protocol)
983 {
984         struct sock *sk;
985         struct packet_sock *po;
986         int err;
987
988         if (!capable(CAP_NET_RAW))
989                 return -EPERM;
990         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
991 #ifdef CONFIG_SOCK_PACKET
992             && sock->type != SOCK_PACKET
993 #endif
994             )
995                 return -ESOCKTNOSUPPORT;
996
997         sock->state = SS_UNCONNECTED;
998
999         err = -ENOBUFS;
1000         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1001         if (sk == NULL)
1002                 goto out;
1003
1004         sock->ops = &packet_ops;
1005 #ifdef CONFIG_SOCK_PACKET
1006         if (sock->type == SOCK_PACKET)
1007                 sock->ops = &packet_ops_spkt;
1008 #endif
1009         sock_init_data(sock, sk);
1010
1011         po = pkt_sk(sk);
1012         sk->sk_family = PF_PACKET;
1013         po->num = protocol;
1014
1015         sk->sk_destruct = packet_sock_destruct;
1016         atomic_inc(&packet_socks_nr);
1017
1018         /*
1019          *      Attach a protocol block
1020          */
1021
1022         spin_lock_init(&po->bind_lock);
1023         po->prot_hook.func = packet_rcv;
1024 #ifdef CONFIG_SOCK_PACKET
1025         if (sock->type == SOCK_PACKET)
1026                 po->prot_hook.func = packet_rcv_spkt;
1027 #endif
1028         po->prot_hook.af_packet_priv = sk;
1029
1030         if (protocol) {
1031                 po->prot_hook.type = protocol;
1032                 dev_add_pack(&po->prot_hook);
1033                 sock_hold(sk);
1034                 po->running = 1;
1035         }
1036
1037         write_lock_bh(&packet_sklist_lock);
1038         sk_add_node(sk, &packet_sklist);
1039         write_unlock_bh(&packet_sklist_lock);
1040         return(0);
1041 out:
1042         return err;
1043 }
1044
1045 /*
1046  *      Pull a packet from our receive queue and hand it to the user.
1047  *      If necessary we block.
1048  */
1049
1050 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1051                           struct msghdr *msg, size_t len, int flags)
1052 {
1053         struct sock *sk = sock->sk;
1054         struct sk_buff *skb;
1055         int copied, err;
1056         struct sockaddr_ll *sll;
1057
1058         err = -EINVAL;
1059         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1060                 goto out;
1061
1062 #if 0
1063         /* What error should we return now? EUNATTACH? */
1064         if (pkt_sk(sk)->ifindex < 0)
1065                 return -ENODEV;
1066 #endif
1067
1068         /*
1069          *      Call the generic datagram receiver. This handles all sorts
1070          *      of horrible races and re-entrancy so we can forget about it
1071          *      in the protocol layers.
1072          *
1073          *      Now it will return ENETDOWN, if device have just gone down,
1074          *      but then it will block.
1075          */
1076
1077         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1078
1079         /*
1080          *      An error occurred so return it. Because skb_recv_datagram() 
1081          *      handles the blocking we don't see and worry about blocking
1082          *      retries.
1083          */
1084
1085         if (skb == NULL)
1086                 goto out;
1087
1088         /*
1089          *      If the address length field is there to be filled in, we fill
1090          *      it in now.
1091          */
1092
1093         sll = (struct sockaddr_ll*)skb->cb;
1094         if (sock->type == SOCK_PACKET)
1095                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1096         else
1097                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1098
1099         /*
1100          *      You lose any data beyond the buffer you gave. If it worries a
1101          *      user program they can ask the device for its MTU anyway.
1102          */
1103
1104         copied = skb->len;
1105         if (copied > len)
1106         {
1107                 copied=len;
1108                 msg->msg_flags|=MSG_TRUNC;
1109         }
1110
1111         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1112         if (err)
1113                 goto out_free;
1114
1115         sock_recv_timestamp(msg, sk, skb);
1116
1117         if (msg->msg_name)
1118                 memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
1119
1120         /*
1121          *      Free or return the buffer as appropriate. Again this
1122          *      hides all the races and re-entrancy issues from us.
1123          */
1124         err = (flags&MSG_TRUNC) ? skb->len : copied;
1125
1126 out_free:
1127         skb_free_datagram(sk, skb);
1128 out:
1129         return err;
1130 }
1131
1132 #ifdef CONFIG_SOCK_PACKET
1133 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1134                                int *uaddr_len, int peer)
1135 {
1136         struct net_device *dev;
1137         struct sock *sk = sock->sk;
1138
1139         if (peer)
1140                 return -EOPNOTSUPP;
1141
1142         uaddr->sa_family = AF_PACKET;
1143         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1144         if (dev) {
1145                 strlcpy(uaddr->sa_data, dev->name, 15);
1146                 dev_put(dev);
1147         } else
1148                 memset(uaddr->sa_data, 0, 14);
1149         *uaddr_len = sizeof(*uaddr);
1150
1151         return 0;
1152 }
1153 #endif
1154
1155 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1156                           int *uaddr_len, int peer)
1157 {
1158         struct net_device *dev;
1159         struct sock *sk = sock->sk;
1160         struct packet_sock *po = pkt_sk(sk);
1161         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1162
1163         if (peer)
1164                 return -EOPNOTSUPP;
1165
1166         sll->sll_family = AF_PACKET;
1167         sll->sll_ifindex = po->ifindex;
1168         sll->sll_protocol = po->num;
1169         dev = dev_get_by_index(po->ifindex);
1170         if (dev) {
1171                 sll->sll_hatype = dev->type;
1172                 sll->sll_halen = dev->addr_len;
1173                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1174                 dev_put(dev);
1175         } else {
1176                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1177                 sll->sll_halen = 0;
1178         }
1179         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1180
1181         return 0;
1182 }
1183
1184 #ifdef CONFIG_PACKET_MULTICAST
1185 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1186 {
1187         switch (i->type) {
1188         case PACKET_MR_MULTICAST:
1189                 if (what > 0)
1190                         dev_mc_add(dev, i->addr, i->alen, 0);
1191                 else
1192                         dev_mc_delete(dev, i->addr, i->alen, 0);
1193                 break;
1194         case PACKET_MR_PROMISC:
1195                 dev_set_promiscuity(dev, what);
1196                 break;
1197         case PACKET_MR_ALLMULTI:
1198                 dev_set_allmulti(dev, what);
1199                 break;
1200         default:;
1201         }
1202 }
1203
1204 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1205 {
1206         for ( ; i; i=i->next) {
1207                 if (i->ifindex == dev->ifindex)
1208                         packet_dev_mc(dev, i, what);
1209         }
1210 }
1211
1212 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1213 {
1214         struct packet_sock *po = pkt_sk(sk);
1215         struct packet_mclist *ml, *i;
1216         struct net_device *dev;
1217         int err;
1218
1219         rtnl_lock();
1220
1221         err = -ENODEV;
1222         dev = __dev_get_by_index(mreq->mr_ifindex);
1223         if (!dev)
1224                 goto done;
1225
1226         err = -EINVAL;
1227         if (mreq->mr_alen > dev->addr_len)
1228                 goto done;
1229
1230         err = -ENOBUFS;
1231         i = kmalloc(sizeof(*i), GFP_KERNEL);
1232         if (i == NULL)
1233                 goto done;
1234
1235         err = 0;
1236         for (ml = po->mclist; ml; ml = ml->next) {
1237                 if (ml->ifindex == mreq->mr_ifindex &&
1238                     ml->type == mreq->mr_type &&
1239                     ml->alen == mreq->mr_alen &&
1240                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1241                         ml->count++;
1242                         /* Free the new element ... */
1243                         kfree(i);
1244                         goto done;
1245                 }
1246         }
1247
1248         i->type = mreq->mr_type;
1249         i->ifindex = mreq->mr_ifindex;
1250         i->alen = mreq->mr_alen;
1251         memcpy(i->addr, mreq->mr_address, i->alen);
1252         i->count = 1;
1253         i->next = po->mclist;
1254         po->mclist = i;
1255         packet_dev_mc(dev, i, +1);
1256
1257 done:
1258         rtnl_unlock();
1259         return err;
1260 }
1261
1262 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1263 {
1264         struct packet_mclist *ml, **mlp;
1265
1266         rtnl_lock();
1267
1268         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1269                 if (ml->ifindex == mreq->mr_ifindex &&
1270                     ml->type == mreq->mr_type &&
1271                     ml->alen == mreq->mr_alen &&
1272                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1273                         if (--ml->count == 0) {
1274                                 struct net_device *dev;
1275                                 *mlp = ml->next;
1276                                 dev = dev_get_by_index(ml->ifindex);
1277                                 if (dev) {
1278                                         packet_dev_mc(dev, ml, -1);
1279                                         dev_put(dev);
1280                                 }
1281                                 kfree(ml);
1282                         }
1283                         rtnl_unlock();
1284                         return 0;
1285                 }
1286         }
1287         rtnl_unlock();
1288         return -EADDRNOTAVAIL;
1289 }
1290
1291 static void packet_flush_mclist(struct sock *sk)
1292 {
1293         struct packet_sock *po = pkt_sk(sk);
1294         struct packet_mclist *ml;
1295
1296         if (!po->mclist)
1297                 return;
1298
1299         rtnl_lock();
1300         while ((ml = po->mclist) != NULL) {
1301                 struct net_device *dev;
1302
1303                 po->mclist = ml->next;
1304                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1305                         packet_dev_mc(dev, ml, -1);
1306                         dev_put(dev);
1307                 }
1308                 kfree(ml);
1309         }
1310         rtnl_unlock();
1311 }
1312 #endif
1313
1314 static int
1315 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1316 {
1317         struct sock *sk = sock->sk;
1318         int ret;
1319
1320         if (level != SOL_PACKET)
1321                 return -ENOPROTOOPT;
1322
1323         switch(optname) {
1324 #ifdef CONFIG_PACKET_MULTICAST
1325         case PACKET_ADD_MEMBERSHIP:     
1326         case PACKET_DROP_MEMBERSHIP:
1327         {
1328                 struct packet_mreq_max mreq;
1329                 int len = optlen;
1330                 memset(&mreq, 0, sizeof(mreq));
1331                 if (len < sizeof(struct packet_mreq))
1332                         return -EINVAL;
1333                 if (len > sizeof(mreq))
1334                         len = sizeof(mreq);
1335                 if (copy_from_user(&mreq,optval,len))
1336                         return -EFAULT;
1337                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1338                         return -EINVAL;
1339                 if (optname == PACKET_ADD_MEMBERSHIP)
1340                         ret = packet_mc_add(sk, &mreq);
1341                 else
1342                         ret = packet_mc_drop(sk, &mreq);
1343                 return ret;
1344         }
1345 #endif
1346 #ifdef CONFIG_PACKET_MMAP
1347         case PACKET_RX_RING:
1348         {
1349                 struct tpacket_req req;
1350
1351                 if (optlen<sizeof(req))
1352                         return -EINVAL;
1353                 if (copy_from_user(&req,optval,sizeof(req)))
1354                         return -EFAULT;
1355                 return packet_set_ring(sk, &req, 0);
1356         }
1357         case PACKET_COPY_THRESH:
1358         {
1359                 int val;
1360
1361                 if (optlen!=sizeof(val))
1362                         return -EINVAL;
1363                 if (copy_from_user(&val,optval,sizeof(val)))
1364                         return -EFAULT;
1365
1366                 pkt_sk(sk)->copy_thresh = val;
1367                 return 0;
1368         }
1369 #endif
1370         default:
1371                 return -ENOPROTOOPT;
1372         }
1373 }
1374
1375 static int packet_getsockopt(struct socket *sock, int level, int optname,
1376                              char __user *optval, int __user *optlen)
1377 {
1378         int len;
1379         struct sock *sk = sock->sk;
1380         struct packet_sock *po = pkt_sk(sk);
1381
1382         if (level != SOL_PACKET)
1383                 return -ENOPROTOOPT;
1384
1385         if (get_user(len, optlen))
1386                 return -EFAULT;
1387
1388         if (len < 0)
1389                 return -EINVAL;
1390                 
1391         switch(optname) {
1392         case PACKET_STATISTICS:
1393         {
1394                 struct tpacket_stats st;
1395
1396                 if (len > sizeof(struct tpacket_stats))
1397                         len = sizeof(struct tpacket_stats);
1398                 spin_lock_bh(&sk->sk_receive_queue.lock);
1399                 st = po->stats;
1400                 memset(&po->stats, 0, sizeof(st));
1401                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1402                 st.tp_packets += st.tp_drops;
1403
1404                 if (copy_to_user(optval, &st, len))
1405                         return -EFAULT;
1406                 break;
1407         }
1408         default:
1409                 return -ENOPROTOOPT;
1410         }
1411
1412         if (put_user(len, optlen))
1413                 return -EFAULT;
1414         return 0;
1415 }
1416
1417
1418 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1419 {
1420         struct sock *sk;
1421         struct hlist_node *node;
1422         struct net_device *dev = (struct net_device*)data;
1423
1424         read_lock(&packet_sklist_lock);
1425         sk_for_each(sk, node, &packet_sklist) {
1426                 struct packet_sock *po = pkt_sk(sk);
1427
1428                 switch (msg) {
1429                 case NETDEV_UNREGISTER:
1430 #ifdef CONFIG_PACKET_MULTICAST
1431                         if (po->mclist)
1432                                 packet_dev_mclist(dev, po->mclist, -1);
1433                         // fallthrough
1434 #endif
1435                 case NETDEV_DOWN:
1436                         if (dev->ifindex == po->ifindex) {
1437                                 spin_lock(&po->bind_lock);
1438                                 if (po->running) {
1439                                         __dev_remove_pack(&po->prot_hook);
1440                                         __sock_put(sk);
1441                                         po->running = 0;
1442                                         sk->sk_err = ENETDOWN;
1443                                         if (!sock_flag(sk, SOCK_DEAD))
1444                                                 sk->sk_error_report(sk);
1445                                 }
1446                                 if (msg == NETDEV_UNREGISTER) {
1447                                         po->ifindex = -1;
1448                                         po->prot_hook.dev = NULL;
1449                                 }
1450                                 spin_unlock(&po->bind_lock);
1451                         }
1452                         break;
1453                 case NETDEV_UP:
1454                         spin_lock(&po->bind_lock);
1455                         if (dev->ifindex == po->ifindex && po->num &&
1456                             !po->running) {
1457                                 dev_add_pack(&po->prot_hook);
1458                                 sock_hold(sk);
1459                                 po->running = 1;
1460                         }
1461                         spin_unlock(&po->bind_lock);
1462                         break;
1463                 }
1464         }
1465         read_unlock(&packet_sklist_lock);
1466         return NOTIFY_DONE;
1467 }
1468
1469
1470 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1471                         unsigned long arg)
1472 {
1473         struct sock *sk = sock->sk;
1474
1475         switch(cmd) {
1476                 case SIOCOUTQ:
1477                 {
1478                         int amount = atomic_read(&sk->sk_wmem_alloc);
1479                         return put_user(amount, (int __user *)arg);
1480                 }
1481                 case SIOCINQ:
1482                 {
1483                         struct sk_buff *skb;
1484                         int amount = 0;
1485
1486                         spin_lock_bh(&sk->sk_receive_queue.lock);
1487                         skb = skb_peek(&sk->sk_receive_queue);
1488                         if (skb)
1489                                 amount = skb->len;
1490                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1491                         return put_user(amount, (int __user *)arg);
1492                 }
1493                 case SIOCGSTAMP:
1494                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1495                         
1496 #ifdef CONFIG_INET
1497                 case SIOCADDRT:
1498                 case SIOCDELRT:
1499                 case SIOCDARP:
1500                 case SIOCGARP:
1501                 case SIOCSARP:
1502                 case SIOCGIFADDR:
1503                 case SIOCSIFADDR:
1504                 case SIOCGIFBRDADDR:
1505                 case SIOCSIFBRDADDR:
1506                 case SIOCGIFNETMASK:
1507                 case SIOCSIFNETMASK:
1508                 case SIOCGIFDSTADDR:
1509                 case SIOCSIFDSTADDR:
1510                 case SIOCSIFFLAGS:
1511                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1512 #endif
1513
1514                 default:
1515                         return -ENOIOCTLCMD;
1516         }
1517         return 0;
1518 }
1519
1520 #ifndef CONFIG_PACKET_MMAP
1521 #define packet_mmap sock_no_mmap
1522 #define packet_poll datagram_poll
1523 #else
1524
1525 static unsigned int packet_poll(struct file * file, struct socket *sock,
1526                                 poll_table *wait)
1527 {
1528         struct sock *sk = sock->sk;
1529         struct packet_sock *po = pkt_sk(sk);
1530         unsigned int mask = datagram_poll(file, sock, wait);
1531
1532         spin_lock_bh(&sk->sk_receive_queue.lock);
1533         if (po->pg_vec) {
1534                 unsigned last = po->head ? po->head-1 : po->frame_max;
1535                 struct tpacket_hdr *h;
1536
1537                 h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
1538
1539                 if (h->tp_status)
1540                         mask |= POLLIN | POLLRDNORM;
1541         }
1542         spin_unlock_bh(&sk->sk_receive_queue.lock);
1543         return mask;
1544 }
1545
1546
/* Dirty? Well, I have not yet found a better way to account
 * for user mmaps.
 */
1550
1551 static void packet_mm_open(struct vm_area_struct *vma)
1552 {
1553         struct file *file = vma->vm_file;
1554         struct socket * sock = file->private_data;
1555         struct sock *sk = sock->sk;
1556         
1557         if (sk)
1558                 atomic_inc(&pkt_sk(sk)->mapped);
1559 }
1560
1561 static void packet_mm_close(struct vm_area_struct *vma)
1562 {
1563         struct file *file = vma->vm_file;
1564         struct socket * sock = file->private_data;
1565         struct sock *sk = sock->sk;
1566         
1567         if (sk)
1568                 atomic_dec(&pkt_sk(sk)->mapped);
1569 }
1570
1571 static struct vm_operations_struct packet_mmap_ops = {
1572         .open = packet_mm_open,
1573         .close =packet_mm_close,
1574 };
1575
1576 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1577 {
1578         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1579 }
1580
/*
 *	Free the first 'len' blocks of a block vector, skipping slots that
 *	are NULL, and then the vector itself.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int idx;

	for (idx = 0; idx < len; idx++) {
		if (unlikely(!pg_vec[idx]))
			continue;
		free_pages((unsigned long) pg_vec[idx], order);
	}
	kfree(pg_vec);
}
1591
1592 static inline char *alloc_one_pg_vec_page(unsigned long order)
1593 {
1594         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1595                                          order);
1596 }
1597
1598 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1599 {
1600         unsigned int block_nr = req->tp_block_nr;
1601         char **pg_vec;
1602         int i;
1603
1604         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1605         if (unlikely(!pg_vec))
1606                 goto out;
1607
1608         for (i = 0; i < block_nr; i++) {
1609                 pg_vec[i] = alloc_one_pg_vec_page(order);
1610                 if (unlikely(!pg_vec[i]))
1611                         goto out_free_pgvec;
1612         }
1613
1614 out:
1615         return pg_vec;
1616
1617 out_free_pgvec:
1618         free_pg_vec(pg_vec, order, block_nr);
1619         pg_vec = NULL;
1620         goto out;
1621 }
1622
1623 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1624 {
1625         char **pg_vec = NULL;
1626         struct packet_sock *po = pkt_sk(sk);
1627         int was_running, num, order = 0;
1628         int err = 0;
1629         
1630         if (req->tp_block_nr) {
1631                 int i, l;
1632
1633                 /* Sanity tests and some calculations */
1634
1635                 if (unlikely(po->pg_vec))
1636                         return -EBUSY;
1637
1638                 if (unlikely((int)req->tp_block_size <= 0))
1639                         return -EINVAL;
1640                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1641                         return -EINVAL;
1642                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1643                         return -EINVAL;
1644                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1645                         return -EINVAL;
1646
1647                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1648                 if (unlikely(po->frames_per_block <= 0))
1649                         return -EINVAL;
1650                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1651                              req->tp_frame_nr))
1652                         return -EINVAL;
1653
1654                 err = -ENOMEM;
1655                 order = get_order(req->tp_block_size);
1656                 pg_vec = alloc_pg_vec(req, order);
1657                 if (unlikely(!pg_vec))
1658                         goto out;
1659
1660                 l = 0;
1661                 for (i = 0; i < req->tp_block_nr; i++) {
1662                         char *ptr = pg_vec[i];
1663                         struct tpacket_hdr *header;
1664                         int k;
1665
1666                         for (k = 0; k < po->frames_per_block; k++) {
1667                                 header = (struct tpacket_hdr *) ptr;
1668                                 header->tp_status = TP_STATUS_KERNEL;
1669                                 ptr += req->tp_frame_size;
1670                         }
1671                 }
1672                 /* Done */
1673         } else {
1674                 if (unlikely(req->tp_frame_nr))
1675                         return -EINVAL;
1676         }
1677
1678         lock_sock(sk);
1679
1680         /* Detach socket from network */
1681         spin_lock(&po->bind_lock);
1682         was_running = po->running;
1683         num = po->num;
1684         if (was_running) {
1685                 __dev_remove_pack(&po->prot_hook);
1686                 po->num = 0;
1687                 po->running = 0;
1688                 __sock_put(sk);
1689         }
1690         spin_unlock(&po->bind_lock);
1691                 
1692         synchronize_net();
1693
1694         err = -EBUSY;
1695         if (closing || atomic_read(&po->mapped) == 0) {
1696                 err = 0;
1697 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1698
1699                 spin_lock_bh(&sk->sk_receive_queue.lock);
1700                 pg_vec = XC(po->pg_vec, pg_vec);
1701                 po->frame_max = (req->tp_frame_nr - 1);
1702                 po->head = 0;
1703                 po->frame_size = req->tp_frame_size;
1704                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1705
1706                 order = XC(po->pg_vec_order, order);
1707                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1708
1709                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1710                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1711                 skb_queue_purge(&sk->sk_receive_queue);
1712 #undef XC
1713                 if (atomic_read(&po->mapped))
1714                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1715         }
1716
1717         spin_lock(&po->bind_lock);
1718         if (was_running && !po->running) {
1719                 sock_hold(sk);
1720                 po->running = 1;
1721                 po->num = num;
1722                 dev_add_pack(&po->prot_hook);
1723         }
1724         spin_unlock(&po->bind_lock);
1725
1726         release_sock(sk);
1727
1728         if (pg_vec)
1729                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1730 out:
1731         return err;
1732 }
1733
1734 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1735 {
1736         struct sock *sk = sock->sk;
1737         struct packet_sock *po = pkt_sk(sk);
1738         unsigned long size;
1739         unsigned long start;
1740         int err = -EINVAL;
1741         int i;
1742
1743         if (vma->vm_pgoff)
1744                 return -EINVAL;
1745
1746         size = vma->vm_end - vma->vm_start;
1747
1748         lock_sock(sk);
1749         if (po->pg_vec == NULL)
1750                 goto out;
1751         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1752                 goto out;
1753
1754         start = vma->vm_start;
1755         for (i = 0; i < po->pg_vec_len; i++) {
1756                 struct page *page = virt_to_page(po->pg_vec[i]);
1757                 int pg_num;
1758
1759                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1760                         err = vm_insert_page(vma, start, page);
1761                         if (unlikely(err))
1762                                 goto out;
1763                         start += PAGE_SIZE;
1764                 }
1765         }
1766         atomic_inc(&po->mapped);
1767         vma->vm_ops = &packet_mmap_ops;
1768         err = 0;
1769
1770 out:
1771         release_sock(sk);
1772         return err;
1773 }
1774 #endif
1775
1776
1777 #ifdef CONFIG_SOCK_PACKET
1778 static const struct proto_ops packet_ops_spkt = {
1779         .family =       PF_PACKET,
1780         .owner =        THIS_MODULE,
1781         .release =      packet_release,
1782         .bind =         packet_bind_spkt,
1783         .connect =      sock_no_connect,
1784         .socketpair =   sock_no_socketpair,
1785         .accept =       sock_no_accept,
1786         .getname =      packet_getname_spkt,
1787         .poll =         datagram_poll,
1788         .ioctl =        packet_ioctl,
1789         .listen =       sock_no_listen,
1790         .shutdown =     sock_no_shutdown,
1791         .setsockopt =   sock_no_setsockopt,
1792         .getsockopt =   sock_no_getsockopt,
1793         .sendmsg =      packet_sendmsg_spkt,
1794         .recvmsg =      packet_recvmsg,
1795         .mmap =         sock_no_mmap,
1796         .sendpage =     sock_no_sendpage,
1797 };
1798 #endif
1799
1800 static const struct proto_ops packet_ops = {
1801         .family =       PF_PACKET,
1802         .owner =        THIS_MODULE,
1803         .release =      packet_release,
1804         .bind =         packet_bind,
1805         .connect =      sock_no_connect,
1806         .socketpair =   sock_no_socketpair,
1807         .accept =       sock_no_accept,
1808         .getname =      packet_getname, 
1809         .poll =         packet_poll,
1810         .ioctl =        packet_ioctl,
1811         .listen =       sock_no_listen,
1812         .shutdown =     sock_no_shutdown,
1813         .setsockopt =   packet_setsockopt,
1814         .getsockopt =   packet_getsockopt,
1815         .sendmsg =      packet_sendmsg,
1816         .recvmsg =      packet_recvmsg,
1817         .mmap =         packet_mmap,
1818         .sendpage =     sock_no_sendpage,
1819 };
1820
1821 static struct net_proto_family packet_family_ops = {
1822         .family =       PF_PACKET,
1823         .create =       packet_create,
1824         .owner  =       THIS_MODULE,
1825 };
1826
1827 static struct notifier_block packet_netdev_notifier = {
1828         .notifier_call =packet_notifier,
1829 };
1830
1831 #ifdef CONFIG_PROC_FS
1832 static inline struct sock *packet_seq_idx(loff_t off)
1833 {
1834         struct sock *s;
1835         struct hlist_node *node;
1836
1837         sk_for_each(s, node, &packet_sklist) {
1838                 if (!off--)
1839                         return s;
1840         }
1841         return NULL;
1842 }
1843
1844 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1845 {
1846         read_lock(&packet_sklist_lock);
1847         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1848 }
1849
1850 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1851 {
1852         ++*pos;
1853         return  (v == SEQ_START_TOKEN) 
1854                 ? sk_head(&packet_sklist) 
1855                 : sk_next((struct sock*)v) ;
1856 }
1857
1858 static void packet_seq_stop(struct seq_file *seq, void *v)
1859 {
1860         read_unlock(&packet_sklist_lock);               
1861 }
1862
1863 static int packet_seq_show(struct seq_file *seq, void *v) 
1864 {
1865         if (v == SEQ_START_TOKEN)
1866                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1867         else {
1868                 struct sock *s = v;
1869                 const struct packet_sock *po = pkt_sk(s);
1870
1871                 seq_printf(seq,
1872                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1873                            s,
1874                            atomic_read(&s->sk_refcnt),
1875                            s->sk_type,
1876                            ntohs(po->num),
1877                            po->ifindex,
1878                            po->running,
1879                            atomic_read(&s->sk_rmem_alloc),
1880                            sock_i_uid(s),
1881                            sock_i_ino(s) );
1882         }
1883
1884         return 0;
1885 }
1886
1887 static struct seq_operations packet_seq_ops = {
1888         .start  = packet_seq_start,
1889         .next   = packet_seq_next,
1890         .stop   = packet_seq_stop,
1891         .show   = packet_seq_show,
1892 };
1893
1894 static int packet_seq_open(struct inode *inode, struct file *file)
1895 {
1896         return seq_open(file, &packet_seq_ops);
1897 }
1898
1899 static struct file_operations packet_seq_fops = {
1900         .owner          = THIS_MODULE,
1901         .open           = packet_seq_open,
1902         .read           = seq_read,
1903         .llseek         = seq_lseek,
1904         .release        = seq_release,
1905 };
1906
1907 #endif
1908
1909 static void __exit packet_exit(void)
1910 {
1911         proc_net_remove("packet");
1912         unregister_netdevice_notifier(&packet_netdev_notifier);
1913         sock_unregister(PF_PACKET);
1914         proto_unregister(&packet_proto);
1915 }
1916
1917 static int __init packet_init(void)
1918 {
1919         int rc = proto_register(&packet_proto, 0);
1920
1921         if (rc != 0)
1922                 goto out;
1923
1924         sock_register(&packet_family_ops);
1925         register_netdevice_notifier(&packet_netdev_notifier);
1926         proc_net_fops_create("packet", 0, &packet_seq_fops);
1927 out:
1928         return rc;
1929 }
1930
1931 module_init(packet_init);
1932 module_exit(packet_exit);
1933 MODULE_LICENSE("GPL");
1934 MODULE_ALIAS_NETPROTO(PF_PACKET);