1 /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
4 * Filesystem request handling methods
8 #include <linux/hdreg.h>
9 #include <linux/blkdev.h>
10 #include <linux/skbuff.h>
11 #include <linux/netdevice.h>
12 #include <linux/genhd.h>
13 #include <linux/moduleparam.h>
14 #include <net/net_namespace.h>
15 #include <asm/unaligned.h>
/*
 * Module parameters, both registered with mode 0644.
 * aoe_deadsecs (default 60 * 3) is the limit rexmit_timer() compares a
 * frame's accumulated wait against; aoe_maxout (default 16) is the cap that
 * aoecmd_cfg_rsp() applies to the buffer count advertised by a target.
 */
18 static int aoe_deadsecs = 60 * 3;
19 module_param(aoe_deadsecs, int, 0644);
20 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
22 static int aoe_maxout = 16;
23 module_param(aoe_maxout, int, 0644);
24 MODULE_PARM_DESC(aoe_maxout,
25 "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
/*
 * Allocate an skb of len bytes with GFP_ATOMIC and prepare it for AoE use:
 * the MAC and network header offsets are reset, the protocol is set to
 * ETH_P_AOE, the list links are cleared and ip_summed is set to
 * CHECKSUM_NONE.
 * NOTE(review): the line carrying the function name and any handling of an
 * alloc_skb() failure are not part of this excerpt; other functions in this
 * file call new_skb(), which is presumably this one -- confirm.
 */
27 static struct sk_buff *
32 skb = alloc_skb(len, GFP_ATOMIC);
34 skb_reset_mac_header(skb);
35 skb_reset_network_header(skb);
36 skb->protocol = __constant_htons(ETH_P_AOE);
38 skb->next = skb->prev = NULL;
40 /* tell the network layer not to perform IP checksums
41 * or to get the NIC to do it
43 skb->ip_summed = CHECKSUM_NONE;
/*
 * NOTE(review): only the signature is visible in this excerpt.  Presumably
 * looks up the frame of target t that carries the given tag -- confirm
 * against the function body before relying on this.
 */
49 getframe(struct aoetgt *t, int tag)
62 * Leave the top bit clear so we have tagspace for userland.
63 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
64 * This driver reserves tag -1 to mean "unused frame."
/*
 * Produce the next tag for target t.  t->lasttag is pre-incremented, masked
 * to 15 bits (0x7fff) and shifted into bits 16..30, then OR-ed into n, so
 * bit 31 of the shifted part stays clear.
 * NOTE(review): the initial value of n is set on a line not shown; per the
 * existing comment above it is the transmit tick in the low 16 bits --
 * confirm.
 */
67 newtag(struct aoetgt *t)
72 return n |= (++t->lasttag & 0x7fff) << 16;
/*
 * Fill in the AoE header h for a command from device d to target t.
 * A fresh tag is obtained from newtag(t).  The source MAC is the address of
 * the net device behind the target's current interface (t->ifp->nd), the
 * destination is t->addr, and type, major and tag are stored big-endian.
 * NOTE(review): the return statement is not visible; callers store the
 * result in f->tag, so it presumably returns host_tag -- confirm.
 */
76 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
78 u32 host_tag = newtag(t);
80 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
81 memcpy(h->dst, t->addr, sizeof h->dst);
82 h->type = __constant_cpu_to_be16(ETH_P_AOE);
84 h->major = cpu_to_be16(d->aoemajor);
85 h->minor = d->aoeminor;
87 h->tag = cpu_to_be32(host_tag);
/*
 * Store the sector address lba into the ATA header ah.
 * The one visible statement shifts lba right by 8 bits and assigns the
 * result to ah->lba5.
 * NOTE(review): the assignments to the other lba fields are presumably on
 * the lines missing from this excerpt -- confirm.
 */
93 put_lba(struct aoe_atahdr *ah, sector_t lba)
100 ah->lba5 = lba >>= 8;
/*
 * Move target t's current-interface pointer (t->ifp) to another entry.
 * The first visible test fires when t->ifp has run past t->ifs[NAOEIFS] or
 * points at an entry without a net device; the action taken is on a line
 * not shown (presumably a wrap back to the first entry -- confirm).
 * If the entry finally selected still has no net device, an informational
 * message is logged.
 */
104 ifrotate(struct aoetgt *t)
107 if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
109 if (t->ifp->nd == NULL) {
110 printk(KERN_INFO "aoe: no interface to rotate to\n");
/*
 * Append skb to the tail of the device's skb pool (d->skbpool).
 * __skb_queue_tail() does no locking of its own.
 * NOTE(review): presumably the caller is expected to hold d->lock -- confirm.
 */
116 skb_pool_put(struct aoedev *d, struct sk_buff *skb)
118 __skb_queue_tail(&d->skbpool, skb);
/*
 * Obtain an skb for device d, preferring the device's pool.
 * The skb at the head of d->skbpool is taken (unlinked) only when its
 * shared-info dataref count is 1.  Otherwise, while the pool holds fewer
 * than NSKBPOOLMAX entries, a new skb is allocated with new_skb(ETH_ZLEN).
 * NOTE(review): the return statements are not visible; presumably the skb
 * found is returned, and NULL when neither path succeeds -- confirm.
 */
121 static struct sk_buff *
122 skb_pool_get(struct aoedev *d)
124 struct sk_buff *skb = skb_peek(&d->skbpool);
126 if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
127 __skb_unlink(skb, &d->skbpool);
130 if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
131 (skb = new_skb(ETH_ZLEN)))
137 /* freeframe is where we do our load balancing so it's a little hairy. */
/*
 * Pick a frame to carry the next command, walking the device's targets.
 * From the visible lines: a target is considered when its nout is below its
 * maxout; its frames are scanned and those whose tag is not FREETAG are
 * skipped; new_skb(ETH_ZLEN) supplies an skb for a frame (apparently when it
 * has none); at "gotone" the skb's fragment count and data_len are cleared.
 * When the frames' skbs are still held by the network layer, an skb is taken
 * from the device pool with skb_pool_get() and the frame's previous skb is
 * parked there with skb_pool_put().  DEVFL_KICKME is set after the
 * "unexpected null rf" report and once more further down.  The walk ends
 * when t is back at d->tgt.
 * NOTE(review): several conditions and every return statement fall on lines
 * missing from this excerpt -- confirm the details against the full function.
 */
138 static struct frame *
139 freeframe(struct aoedev *d)
141 struct frame *f, *e, *rf;
145 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
146 printk(KERN_ERR "aoe: NULL TARGETS!\n");
151 if (t >= &d->targets[NTARGETS] || !*t)
154 if ((*t)->nout < (*t)->maxout
159 e = f + (*t)->nframes;
161 if (f->tag != FREETAG)
165 && !(f->skb = skb = new_skb(ETH_ZLEN)))
167 if (atomic_read(&skb_shinfo(skb)->dataref)
173 gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
179 /* Work can be done, but the network layer is
180 holding our precious packets. Try to grab
181 one from the pool. */
183 if (f == NULL) { /* more paranoia */
185 "aoe: freeframe: %s.\n",
186 "unexpected null rf");
187 d->flags |= DEVFL_KICKME;
190 skb = skb_pool_get(d);
192 skb_pool_put(d, f->skb);
198 d->flags |= DEVFL_KICKME;
200 if (t == d->tgt) /* we've looped and found nada */
203 if (t >= &d->targets[NTARGETS] || !*t)
/*
 * Build one ATA read/write request frame for the buffer being worked on
 * (buf) and queue it for transmission.
 * The byte count starts at the current interface's maxbcnt and is capped at
 * what is left in the current bio_vec (buf->bv_resid).  The AoE and ATA
 * headers are zeroed and filled in, the frame records its tag, data address
 * and starting sector, and for writes the data page is attached as skb
 * fragment 0.  The buffer's counters are advanced by bcnt, moving on to the
 * next bio_vec when the current one is used up, and a clone of the skb is
 * appended to d->sendq.
 * NOTE(review): the return values and the handling of a failed skb_clone()
 * are on lines not shown; aoecmd_work() tests the result as a boolean.
 */
210 aoecmd_ata_rw(struct aoedev *d)
214 struct aoe_atahdr *ah;
220 char writebit, extbit;
231 bcnt = t->ifp->maxbcnt;
234 if (bcnt > buf->bv_resid)
235 bcnt = buf->bv_resid;
236 /* initialize the headers & frame */
238 h = (struct aoe_hdr *) skb_mac_header(skb);
239 ah = (struct aoe_atahdr *) (h+1);
240 skb_put(skb, sizeof *h + sizeof *ah);
241 memset(h, 0, skb->len);
242 f->tag = aoehdr_atainit(d, t, h);
246 f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
248 f->lba = buf->sector;
250 /* set up ata header */
/* bcnt is a byte count; scnt is bcnt divided by 512 */
251 ah->scnt = bcnt >> 9;
252 put_lba(ah, buf->sector);
253 if (d->flags & DEVFL_EXT) {
254 ah->aflags |= AOEAFL_EXT;
258 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
260 if (bio_data_dir(buf->bio) == WRITE) {
261 skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
262 ah->aflags |= AOEAFL_WRITE;
264 skb->data_len = bcnt;
/* the command byte is the PIO read opcode with the write/ext bits OR-ed in */
271 ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
273 /* mark all tracking fields and load out */
274 buf->nframesout += 1;
276 buf->bv_resid -= bcnt;
278 buf->sector += bcnt >> 9;
279 if (buf->resid == 0) {
281 } else if (buf->bv_resid == 0) {
283 buf->bv_resid = bv->bv_len;
284 WARN_ON(buf->bv_resid == 0);
285 buf->bv_off = bv->bv_offset;
288 skb->dev = t->ifp->nd;
289 skb = skb_clone(skb, GFP_ATOMIC);
291 __skb_queue_tail(&d->sendq, skb);
/*
 * Build one AoE config-query frame per AoE-enabled net device and append
 * each to the caller's queue; nothing is transmitted in the visible lines.
 * The device list of init_net is walked under read_lock(&dev_base_lock) and
 * devices rejected by is_aoe_netif() are skipped.  Each frame is sized for
 * an AoE header plus a config header, zeroed, addressed to the broadcast MAC
 * (all 0xff) with the interface's own address as source, typed ETH_P_AOE,
 * and carries aoemajor big-endian.  An skb allocation failure is logged.
 * NOTE(review): how aoeminor is stored and what follows an allocation
 * failure are on lines not shown -- confirm.
 */
295 /* some callers cannot sleep, and they can call this function,
296 * transmitting the packets later, when interrupts are on
299 aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
302 struct aoe_cfghdr *ch;
304 struct net_device *ifp;
306 read_lock(&dev_base_lock);
307 for_each_netdev(&init_net, ifp) {
309 if (!is_aoe_netif(ifp))
312 skb = new_skb(sizeof *h + sizeof *ch);
314 printk(KERN_INFO "aoe: skb alloc failure\n");
317 skb_put(skb, sizeof *h + sizeof *ch);
319 __skb_queue_tail(queue, skb);
320 h = (struct aoe_hdr *) skb_mac_header(skb);
321 memset(h, 0, sizeof *h + sizeof *ch);
323 memset(h->dst, 0xff, sizeof h->dst);
324 memcpy(h->src, ifp->dev_addr, sizeof h->src);
325 h->type = __constant_cpu_to_be16(ETH_P_AOE);
327 h->major = cpu_to_be16(aoemajor);
334 read_unlock(&dev_base_lock);
/*
 * Retransmit frame f of device d through target t.
 * A "retransmit" debug line is formatted with the old tag, jiffies, the new
 * tag n, the previous source/destination MACs and t->nout.  The header then
 * gets the new tag, and its destination and source are refreshed from
 * t->addr and the target's current interface.  For the PIO read/write
 * opcodes there is extra handling (not shown); for a write the data page is
 * re-attached as fragment 0 from f->bufaddr and skb->len is recomputed.
 * A clone of the skb is appended to d->sendq.
 * NOTE(review): where n comes from, what is done with the formatted
 * message, and the handling of a failed skb_clone() are on lines missing
 * from this excerpt -- confirm.
 */
338 resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
342 struct aoe_atahdr *ah;
349 h = (struct aoe_hdr *) skb_mac_header(skb);
350 ah = (struct aoe_atahdr *) (h+1);
352 snprintf(buf, sizeof buf,
353 "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
354 "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
355 h->src, h->dst, t->nout);
359 h->tag = cpu_to_be32(n);
360 memcpy(h->dst, t->addr, sizeof h->dst);
361 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
363 switch (ah->cmdstat) {
366 case ATA_CMD_PIO_READ:
367 case ATA_CMD_PIO_READ_EXT:
368 case ATA_CMD_PIO_WRITE:
369 case ATA_CMD_PIO_WRITE_EXT:
376 if (ah->aflags & AOEAFL_WRITE) {
377 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
378 offset_in_page(f->bufaddr), n);
379 skb->len = sizeof *h + sizeof *ah + n;
383 skb->dev = t->ifp->nd;
384 skb = skb_clone(skb, GFP_ATOMIC);
387 __skb_queue_tail(&d->sendq, skb);
395 n = jiffies & 0xffff;
/*
 * NOTE(review): only the signature is visible in this excerpt.  Callers in
 * this file pass a target and a net device and test the result for NULL, so
 * this presumably returns the target's aoeif entry for nd, or NULL when
 * there is none -- confirm against the function body.
 */
402 static struct aoeif *
403 getif(struct aoetgt *t, struct net_device *nd)
/*
 * Register net device nd as an interface of target t.
 * The one visible statement initialises the entry's maxbcnt to DEFAULTBCNT.
 * NOTE(review): slot selection and the return value are on lines not shown;
 * aoecmd_cfg_rsp() reports "too many interfaces?" when this fails, so it
 * presumably returns NULL when no slot is free -- confirm.
 */
415 static struct aoeif *
416 addif(struct aoetgt *t, struct net_device *nd)
424 p->maxbcnt = DEFAULTBCNT;
/*
 * Remove interface entry ifp from target t's interface array.
 * e points at the last element of t->ifs; n is the size in bytes of the
 * entries from ifp up to (not including) e.  memmove() then shifts the
 * entries that follow ifp down by one slot, overwriting ifp.
 * NOTE(review): any release of the net device and the clearing of the last
 * slot are on lines not shown -- confirm.
 */
431 ejectif(struct aoetgt *t, struct aoeif *ifp)
436 e = t->ifs + NAOEIFS - 1;
437 n = (e - ifp) * sizeof *ifp;
438 memmove(ifp, ifp+1, n);
/*
 * Work on behalf of the target currently marked as needing help (*d->htgt,
 * called ht here).  Frames of ht tagged FREETAG are skipped; for the others
 * a frame nf is resent through the device's current target (*d->tgt).
 * Afterwards ht's interface table is zeroed.
 * NOTE(review): how nf is obtained and filled, and the return value, are on
 * lines not shown; aoecmd_work() treats a zero result as "cannot proceed" --
 * confirm.
 */
443 sthtith(struct aoedev *d)
445 struct frame *f, *e, *nf;
447 struct aoetgt *ht = *d->htgt;
452 if (f->tag == FREETAG)
464 resend(d, *d->tgt, nf);
466 /* he's clean, he's useless. take away his interfaces */
467 memset(ht->ifs, 0, sizeof ht->ifs);
/*
 * Given a pointer to the start of an AoE frame, locate the ATA header that
 * immediately follows the AoE header.
 * NOTE(review): the return statement is not visible; going by the name and
 * the caller's comparison against DEFAULTBCNT / 512, it presumably returns
 * the header's scnt field -- confirm.
 */
472 static inline unsigned char
473 ata_scnt(unsigned char *packet) {
475 struct aoe_atahdr *ah;
477 h = (struct aoe_hdr *) packet;
478 ah = (struct aoe_atahdr *) (h+1);
/*
 * Retransmit timer callback; vp carries the struct aoedev pointer.
 * With d->lock held it walks the frames of every target.  Frames tagged
 * FREETAG, or whose tsince(tag) is still below timeout, are skipped; for the
 * rest timeout is added to f->waited.  A frame that has waited beyond
 * aoe_deadsecs is treated as a device failure.  The remaining visible checks
 * look for another target that could help (HELPWAIT), track when the
 * target's window was last adjusted (lastwadj), count losses per interface
 * (lost, lostjumbo) and report a fall-back from jumbo frames.  Before the
 * lock is released, d->sendq is spliced onto a local queue and the timer is
 * re-armed for jiffies + TIMERTICK.
 * When DEVFL_TKILL is set the lock is dropped straight away.
 * NOTE(review): the actions inside most branches, the unit conversion
 * applied to n before the aoe_deadsecs comparison, and what is done with the
 * local queue after unlocking are on lines missing from this excerpt --
 * confirm against the full function.
 */
483 rexmit_timer(ulong vp)
485 struct sk_buff_head queue;
487 struct aoetgt *t, **tt, **te;
490 register long timeout;
493 d = (struct aoedev *) vp;
495 /* timeout is always ~150% of the moving average */
497 timeout += timeout >> 1;
499 spin_lock_irqsave(&d->lock, flags);
501 if (d->flags & DEVFL_TKILL) {
502 spin_unlock_irqrestore(&d->lock, flags);
507 for (; tt < te && *tt; tt++) {
512 if (f->tag == FREETAG
513 || tsince(f->tag) < timeout)
515 n = f->waited += timeout;
517 if (n > aoe_deadsecs) {
518 /* waited too long. device failure. */
523 if (n > HELPWAIT /* see if another target can help */
524 && (tt != d->targets || d->targets[1]))
527 if (t->nout == t->maxout) {
530 t->lastwadj = jiffies;
533 ifp = getif(t, f->skb->dev);
534 if (ifp && ++ifp->lost > (t->nframes << 1)
535 && (ifp != t->ifs || t->ifs[1].nd)) {
540 if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
541 && ifp && ++ifp->lostjumbo > (t->nframes << 1)
542 && ifp->maxbcnt != DEFAULTBCNT) {
545 "too many lost jumbo on "
547 "falling back to %d frames.\n",
548 d->aoemajor, d->aoeminor,
549 ifp->nd->name, t->addr,
557 if (t->nout == t->maxout
558 && t->maxout < t->nframes
559 && (jiffies - t->lastwadj)/HZ > 10) {
561 t->lastwadj = jiffies;
565 if (!skb_queue_empty(&d->sendq)) {
568 d->rttavg = MAXTIMER;
571 if (d->flags & DEVFL_KICKME || d->htgt) {
572 d->flags &= ~DEVFL_KICKME;
576 __skb_queue_head_init(&queue);
577 skb_queue_splice_init(&d->sendq, &queue);
579 d->timer.expires = jiffies + TIMERTICK;
580 add_timer(&d->timer);
582 spin_unlock_irqrestore(&d->lock, flags);
587 /* enters with d->lock held */
/*
 * Push queued I/O for device d towards the network.
 * If a target needs help (d->htgt) and sthtith() returns zero, the visible
 * test fires.  When no buffer is in progress (d->inprocess == NULL), the
 * first entry of d->bufq is detached, unless the list is empty.
 * aoecmd_ata_rw() is then called and its result tested.
 * NOTE(review): the statements controlled by each test (returns, the
 * assignment of d->inprocess, any loop back) are on lines not shown --
 * confirm.
 */
589 aoecmd_work(struct aoedev *d)
593 if (d->htgt && !sthtith(d))
595 if (d->inprocess == NULL) {
596 if (list_empty(&d->bufq))
598 buf = container_of(d->bufq.next, struct buf, bufs);
599 list_del(d->bufq.next);
602 if (aoecmd_ata_rw(d))
/*
 * Work handler embedded in struct aoedev (the device is recovered from the
 * work pointer with container_of).
 * Visible behaviour: it tests DEVFL_GDALLOC; when DEVFL_NEWSIZE is set it
 * reads the gendisk capacity, obtains the block device for partition 0 with
 * bdget_disk(), writes capacity << 9 into that inode's size under i_mutex,
 * and then, under d->lock, sets DEVFL_UP and clears DEVFL_NEWSIZE.
 * NOTE(review): the action taken for DEVFL_GDALLOC and any handling of a
 * NULL return from bdget_disk() are on lines not shown -- confirm.
 */
606 /* this function performs work that has been deferred until sleeping is OK
609 aoecmd_sleepwork(struct work_struct *work)
611 struct aoedev *d = container_of(work, struct aoedev, work);
613 if (d->flags & DEVFL_GDALLOC)
616 if (d->flags & DEVFL_NEWSIZE) {
617 struct block_device *bd;
621 ssize = get_capacity(d->gd);
622 bd = bdget_disk(d->gd, 0);
625 mutex_lock(&bd->bd_inode->i_mutex);
626 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
627 mutex_unlock(&bd->bd_inode->i_mutex);
630 spin_lock_irqsave(&d->lock, flags);
631 d->flags |= DEVFL_UP;
632 d->flags &= ~DEVFL_NEWSIZE;
633 spin_unlock_irqrestore(&d->lock, flags);
/*
 * Digest the ATA IDENTIFY data in id (an array of little-endian 16-bit
 * words, indexed here as id[word << 1]) for device d.
 * Words 83 and 86 are OR-ed together; if bit 10 is set the device is marked
 * DEVFL_EXT, the sector count is read as a 64-bit value at word 100 and the
 * cylinder count is derived as sectors / (255 * 63).  Otherwise DEVFL_EXT is
 * cleared, the sector count is the 32-bit value at word 60 and the geometry
 * comes from words 54-56.  A size that differs from d->ssize is logged.
 * If DEVFL_GDALLOC or DEVFL_NEWSIZE is already set the visible test fires
 * (action not shown); otherwise the capacity is set with DEVFL_NEWSIZE, or
 * DEVFL_GDALLOC is raised, and d->work is scheduled.
 * NOTE(review): the size message uses %llu while the argument is cast to
 * (long long); a cast to unsigned long long would match the format.
 * NOTE(review): the conditions choosing between the NEWSIZE and GDALLOC
 * paths are on lines missing from this excerpt -- confirm.
 */
638 ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
643 /* word 83: command set supported */
644 n = get_unaligned_le16(&id[83 << 1]);
646 /* word 86: command set/feature enabled */
647 n |= get_unaligned_le16(&id[86 << 1]);
649 if (n & (1<<10)) { /* bit 10: LBA 48 */
650 d->flags |= DEVFL_EXT;
652 /* word 100: number lba48 sectors */
653 ssize = get_unaligned_le64(&id[100 << 1]);
655 /* set as in ide-disk.c:init_idedisk_capacity */
656 d->geo.cylinders = ssize;
657 d->geo.cylinders /= (255 * 63);
661 d->flags &= ~DEVFL_EXT;
663 /* number lba28 sectors */
664 ssize = get_unaligned_le32(&id[60 << 1]);
666 /* NOTE: obsolete in ATA 6 */
667 d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
668 d->geo.heads = get_unaligned_le16(&id[55 << 1]);
669 d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
672 if (d->ssize != ssize)
674 "aoe: %pm e%ld.%d v%04x has %llu sectors\n",
676 d->aoemajor, d->aoeminor,
677 d->fw_ver, (long long)ssize);
680 if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
683 set_capacity(d->gd, ssize);
684 d->flags |= DEVFL_NEWSIZE;
686 d->flags |= DEVFL_GDALLOC;
687 schedule_work(&d->work);
/*
 * Fold a round-trip sample rtt into device d's timing state.
 * aoecmd_ata_rsp() passes tsince(tag) for a response that matches an
 * outstanding frame and the negated value for one that does not.
 * The visible lines compare a working value n against MAXTIMER and
 * d->mintimer, and in one branch move d->mintimer halfway towards n.
 * NOTE(review): the derivation of n and the rttavg update itself (gain .25
 * according to the existing comment) are on lines not shown -- confirm.
 */
691 calc_rttavg(struct aoedev *d, int rtt)
700 else if (n > MAXTIMER)
702 d->mintimer += (n - d->mintimer) >> 1;
703 } else if (n < d->mintimer)
705 else if (n > MAXTIMER)
708 /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
/*
 * Search device d's target table for the target whose address equals addr.
 * The scan stops at the end of the table or at the first NULL slot;
 * addresses are compared with memcmp over sizeof((*t)->addr) bytes.
 * NOTE(review): the return statements are not visible; presumably the
 * matching target is returned, and NULL when none matches -- confirm.
 */
713 static struct aoetgt *
714 gettgt(struct aoedev *d, char *addr)
716 struct aoetgt **t, **e;
720 for (; t < e && *t; t++)
721 if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
/*
 * Account a bio in the statistics of the partition that contains sector.
 * The sector count is bio->bi_size >> 9 and the direction index comes from
 * bio_data_dir().  Between part_stat_lock() and the matching unlock (not
 * shown) the partition is looked up with disk_map_sector_rcu() and its
 * ios, ticks and sectors counters for that direction, plus io_ticks, are
 * increased; duration is the value added to both tick counters.
 */
727 diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
729 unsigned long n_sect = bio->bi_size >> 9;
730 const int rw = bio_data_dir(bio);
731 struct hd_struct *part;
734 cpu = part_stat_lock();
735 part = disk_map_sector_rcu(disk, sector);
737 part_stat_inc(cpu, part, ios[rw]);
738 part_stat_add(cpu, part, ticks[rw], duration);
739 part_stat_add(cpu, part, sectors[rw], n_sect);
740 part_stat_add(cpu, part, io_ticks, duration);
/*
 * Handle an incoming ATA response frame (skb).
 * The device is looked up from the header's major/minor; an unknown device
 * only produces a formatted message.  With d->lock held, the sending target
 * is found by source MAC (a miss is logged and the lock released).  A
 * response that matches no outstanding frame feeds calc_rttavg() with a
 * negated sample and is reported; a matched one feeds it tsince(f->tag).
 * An error status (any bit of 0xa9 set) is logged and marks the buffer
 * BUFFL_FAIL.  Otherwise the original command decides: reads copy n bytes of
 * payload to f->bufaddr after a runt check, reads and writes then share the
 * per-interface handling, and an IDENTIFY response is passed to
 * ataid_complete() after a 512-byte runt check.  When the buffer's last
 * outstanding frame has returned and nothing is left to send, statistics are
 * recorded, the bio is completed (-EIO on failure) and the buffer is
 * returned to d->bufpool.  Finally d->sendq is spliced onto a local queue
 * and the lock released.
 * NOTE(review): many statements (returns, frame release, the use made of
 * the local queue) are on lines missing from this excerpt -- confirm.
 */
746 aoecmd_ata_rsp(struct sk_buff *skb)
748 struct sk_buff_head queue;
750 struct aoe_hdr *hin, *hout;
751 struct aoe_atahdr *ahin, *ahout;
761 hin = (struct aoe_hdr *) skb_mac_header(skb);
762 aoemajor = get_unaligned_be16(&hin->major);
763 d = aoedev_by_aoeaddr(aoemajor, hin->minor);
765 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
766 "for unknown device %d.%d\n",
767 aoemajor, hin->minor);
772 spin_lock_irqsave(&d->lock, flags);
774 n = get_unaligned_be32(&hin->tag);
775 t = gettgt(d, hin->src);
777 printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
778 d->aoemajor, d->aoeminor, hin->src);
779 spin_unlock_irqrestore(&d->lock, flags);
784 calc_rttavg(d, -tsince(n));
785 spin_unlock_irqrestore(&d->lock, flags);
786 snprintf(ebuf, sizeof ebuf,
787 "%15s e%d.%d tag=%08x@%08lx\n",
789 get_unaligned_be16(&hin->major),
791 get_unaligned_be32(&hin->tag),
797 calc_rttavg(d, tsince(f->tag));
799 ahin = (struct aoe_atahdr *) (hin+1);
800 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
801 ahout = (struct aoe_atahdr *) (hout+1);
804 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
806 "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
807 ahout->cmdstat, ahin->cmdstat,
808 d->aoemajor, d->aoeminor);
810 buf->flags |= BUFFL_FAIL;
812 if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
/* expected payload size in bytes: the request's sector count times 512 */
814 n = ahout->scnt << 9;
815 switch (ahout->cmdstat) {
816 case ATA_CMD_PIO_READ:
817 case ATA_CMD_PIO_READ_EXT:
/*
 * NOTE(review): sizeof makes this subtraction unsigned, so a frame shorter
 * than the two headers would wrap to a huge value and pass the runt check;
 * confirm that a minimum length is enforced before this point.
 */
818 if (skb->len - sizeof *hin - sizeof *ahin < n) {
820 "aoe: %s. skb->len=%d need=%ld\n",
821 "runt data size in read", skb->len, n);
822 /* fail frame f? just returning will rexmit. */
823 spin_unlock_irqrestore(&d->lock, flags);
826 memcpy(f->bufaddr, ahin+1, n);
/* no break after the copy: reads continue into the write cases below */
827 case ATA_CMD_PIO_WRITE:
828 case ATA_CMD_PIO_WRITE_EXT:
829 ifp = getif(t, skb->dev);
/* NOTE(review): same unsigned-subtraction caveat as the read runt check */
843 if (skb->len - sizeof *hin - sizeof *ahin < 512) {
845 "aoe: runt data size in ataid. skb->len=%d\n",
847 spin_unlock_irqrestore(&d->lock, flags);
850 ataid_complete(d, t, (char *) (ahin+1));
854 "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
856 get_unaligned_be16(&hin->major),
861 if (buf && --buf->nframesout == 0 && buf->resid == 0) {
862 diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
863 n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
864 bio_endio(buf->bio, n);
865 mempool_free(buf, d->bufpool);
874 __skb_queue_head_init(&queue);
875 skb_queue_splice_init(&d->sendq, &queue);
877 spin_unlock_irqrestore(&d->lock, flags);
/*
 * Issue an AoE config query for the given shelf (aoemajor) and slot
 * (aoeminor): a local queue is initialised and filled by aoecmd_cfg_pkts().
 * NOTE(review): what happens to the queued frames afterwards is on a line
 * not shown (presumably they are handed to aoenet_xmit) -- confirm.
 */
882 aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
884 struct sk_buff_head queue;
886 __skb_queue_head_init(&queue);
887 aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
/*
 * Build an ATA IDENTIFY request for device d and return a clone of its skb
 * for the caller to transmit.
 * The AoE and ATA headers are zeroed, the AoE header is filled in by
 * aoehdr_atainit() (whose result becomes the frame's tag) and the command is
 * set to ATA_CMD_ID_ATA.  The skb is bound to the current interface's net
 * device, d->rttavg is reset to MAXTIMER and the device timer's handler is
 * set to rexmit_timer.
 * The clone is made with GFP_ATOMIC, so the return value can be NULL.
 * NOTE(review): frame selection and the remaining ATA header fields are on
 * lines missing from this excerpt -- confirm.
 */
892 aoecmd_ata_id(struct aoedev *d)
895 struct aoe_atahdr *ah;
906 /* initialize the headers & frame */
908 h = (struct aoe_hdr *) skb_mac_header(skb);
909 ah = (struct aoe_atahdr *) (h+1);
910 skb_put(skb, sizeof *h + sizeof *ah);
911 memset(h, 0, skb->len);
912 f->tag = aoehdr_atainit(d, t, h);
916 /* set up ata header */
918 ah->cmdstat = ATA_CMD_ID_ATA;
921 skb->dev = t->ifp->nd;
923 d->rttavg = MAXTIMER;
924 d->timer.function = rexmit_timer;
926 return skb_clone(skb, GFP_ATOMIC);
/*
 * Add a target with MAC address addr and nframes frames to device d.
 * The target table is scanned for the first NULL slot; a full table is
 * reported as "too many targets".  The target and its frame array are
 * zero-allocated with GFP_ATOMIC and an allocation failure is logged.  The
 * new target records nframes, copies addr and starts with maxout equal to
 * nframes.
 * NOTE(review): the return statements, the cleanup after a partial
 * allocation failure and the remaining field setup are on lines not shown
 * -- confirm.
 */
929 static struct aoetgt *
930 addtgt(struct aoedev *d, char *addr, ulong nframes)
932 struct aoetgt *t, **tt, **te;
937 for (; tt < te && *tt; tt++)
942 "aoe: device addtgt failure; too many targets\n");
945 t = kcalloc(1, sizeof *t, GFP_ATOMIC);
946 f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
950 printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
954 t->nframes = nframes;
959 memcpy(t->addr, addr, sizeof t->addr);
961 t->maxout = t->nframes;
/*
 * Handle an incoming AoE config response frame (skb).
 * The shelf number is read from the header; the special value 0xfff gets a
 * loud warning, and a system minor too large for MINORMASK is rejected with
 * a message.  The advertised buffer count is capped at aoe_maxout.  The
 * device is obtained with aoedev_by_sysminor_m(); under d->lock the sending
 * target is looked up or added, and likewise the receiving interface.  A
 * per-interface data size n is computed (DEFAULTBCNT when the sector count
 * works out to zero) and a change is logged.  The firmware version is
 * recorded, an IDENTIFY request is built with aoecmd_ata_id(), and after the
 * lock is released that skb is sent through aoenet_xmit() on a local queue.
 * NOTE(review): the warning text says "all ones" but the test is against
 * 0xfff (12 bits) while the major is read as a 16-bit field -- confirm the
 * constant is intended.
 * NOTE(review): the failure branches, the start of the size computation and
 * the condition guarding the early unlock near "don't change users'
 * perspective" are on lines missing from this excerpt -- confirm.
 */
966 aoecmd_cfg_rsp(struct sk_buff *skb)
970 struct aoe_cfghdr *ch;
973 ulong flags, sysminor, aoemajor;
977 h = (struct aoe_hdr *) skb_mac_header(skb);
978 ch = (struct aoe_cfghdr *) (h+1);
981 * Enough people have their dip switches set backwards to
982 * warrant a loud message for this special case.
984 aoemajor = get_unaligned_be16(&h->major);
985 if (aoemajor == 0xfff) {
986 printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
987 "Check shelf dip switches.\n");
991 sysminor = SYSMINOR(aoemajor, h->minor);
992 if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
993 printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
994 aoemajor, (int) h->minor);
998 n = be16_to_cpu(ch->bufcnt);
999 if (n > aoe_maxout) /* keep it reasonable */
1002 d = aoedev_by_sysminor_m(sysminor);
1004 printk(KERN_INFO "aoe: device sysminor_m failure\n");
1008 spin_lock_irqsave(&d->lock, flags);
1010 t = gettgt(d, h->src);
1012 t = addtgt(d, h->src, n);
1014 spin_unlock_irqrestore(&d->lock, flags);
1018 ifp = getif(t, skb->dev);
1020 ifp = addif(t, skb->dev);
1023 "aoe: device addif failure; "
1024 "too many interfaces?\n");
1025 spin_unlock_irqrestore(&d->lock, flags);
1031 n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
1035 n = n ? n * 512 : DEFAULTBCNT;
1036 if (n != ifp->maxbcnt) {
1038 "aoe: e%ld.%d: setting %d%s%s:%pm\n",
1039 d->aoemajor, d->aoeminor, n,
1040 " byte data frames on ", ifp->nd->name,
1046 /* don't change users' perspective */
1048 spin_unlock_irqrestore(&d->lock, flags);
1051 d->fw_ver = be16_to_cpu(ch->fwver);
1053 sl = aoecmd_ata_id(d);
1055 spin_unlock_irqrestore(&d->lock, flags);
1058 struct sk_buff_head queue;
1059 __skb_queue_head_init(&queue);
1060 __skb_queue_tail(&queue, sl);
1061 aoenet_xmit(&queue);
/*
 * Reset device d's adaptive transmission state to its defaults.
 * d->mintimer goes back to MINTIMER; every target in the table (up to the
 * first NULL slot) gets maxout restored to nframes, and each interface
 * entry visited by the inner loop gets maxbcnt restored to DEFAULTBCNT.
 * NOTE(review): other fields reset here (for example the round-trip average
 * and per-interface loss counters) would be on lines not shown -- confirm.
 */
1066 aoecmd_cleanslate(struct aoedev *d)
1068 struct aoetgt **t, **te;
1069 struct aoeif *p, *e;
1071 d->mintimer = MINTIMER;
1075 for (; t < te && *t; t++) {
1076 (*t)->maxout = (*t)->nframes;
1079 for (; p < e; p++) {
1082 p->maxbcnt = DEFAULTBCNT;