/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoecmd.c
 * Filesystem request handling methods
 */

#include <linux/ata.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/genhd.h>
#include <linux/moduleparam.h>
#include <net/net_namespace.h>
#include <asm/unaligned.h>
#include "aoe.h"

static int aoe_deadsecs = 60 * 3;
module_param(aoe_deadsecs, int, 0644);
MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");

static int aoe_maxout = 16;
module_param(aoe_maxout, int, 0644);
MODULE_PARM_DESC(aoe_maxout,
	"Only aoe_maxout outstanding packets for every MAC on eX.Y.");

static struct sk_buff *
new_skb(ulong len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len, GFP_ATOMIC);
	if (skb) {
		skb_reset_mac_header(skb);
		skb_reset_network_header(skb);
		skb->protocol = __constant_htons(ETH_P_AOE);
	}
	return skb;
}

static struct frame *
getframe(struct aoetgt *t, int tag)
{
	struct frame *f, *e;

	f = t->frames;
	e = f + t->nframes;
	for (; f < e; f++)
		if (f->tag == tag)
			return f;
	return NULL;
}

/*
 * Leave the top bit clear so we have tagspace for userland.
 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
 * This driver reserves tag -1 to mean "unused frame."
 */
static int
newtag(struct aoetgt *t)
{
	register ulong n;

	n = jiffies & 0xffff;
	return n |= (++t->lasttag & 0x7fff) << 16;
}

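/*
 * Illustrative example: with ++t->lasttag == 0x13 and a jiffies tick
 * of 0x3456, newtag() returns 0x00133456: sequence number in bits
 * 16-30, send tick in the low 16 bits for tsince() below, and the top
 * bit left clear so tags with the high bit set remain free for
 * userland.
 */
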
static int
aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
{
	u32 host_tag = newtag(t);

	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
	memcpy(h->dst, t->addr, sizeof h->dst);
	h->type = __constant_cpu_to_be16(ETH_P_AOE);
	h->verfl = AOE_HVER;
	h->major = cpu_to_be16(d->aoemajor);
	h->minor = d->aoeminor;
	h->cmd = AOECMD_ATA;
	h->tag = cpu_to_be32(host_tag);

	return host_tag;
}

static inline void
put_lba(struct aoe_atahdr *ah, sector_t lba)
{
	ah->lba0 = lba;
	ah->lba1 = lba >>= 8;
	ah->lba2 = lba >>= 8;
	ah->lba3 = lba >>= 8;
	ah->lba4 = lba >>= 8;
	ah->lba5 = lba >>= 8;
}

static void
ifrotate(struct aoetgt *t)
{
	t->ifp++;
	if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
		t->ifp = t->ifs;
	if (t->ifp->nd == NULL) {
		printk(KERN_INFO "aoe: no interface to rotate to\n");
		BUG();
	}
}

static void
skb_pool_put(struct aoedev *d, struct sk_buff *skb)
{
	__skb_queue_tail(&d->skbpool, skb);
}

static struct sk_buff *
skb_pool_get(struct aoedev *d)
{
	struct sk_buff *skb = skb_peek(&d->skbpool);

	if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
		__skb_unlink(skb, &d->skbpool);
		return skb;
	}
	if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
	    (skb = new_skb(ETH_ZLEN)))
		return skb;

	return NULL;
}

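/*
 * Reuse note: a dataref of 1 means the network layer has dropped its
 * reference, so a pooled skb is safe to recycle; otherwise a fresh
 * skb is allocated as long as the pool stays below NSKBPOOLMAX.
 */
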
/* freeframe is where we do our load balancing so it's a little hairy. */
static struct frame *
freeframe(struct aoedev *d)
{
	struct frame *f, *e, *rf;
	struct aoetgt **t;
	struct sk_buff *skb;

	if (d->targets[0] == NULL) {	/* shouldn't happen, but I'm paranoid */
		printk(KERN_ERR "aoe: NULL TARGETS!\n");
		return NULL;
	}
	t = d->tgt;
	t++;
	if (t >= &d->targets[NTARGETS] || !*t)
		t = d->targets;
	for (;;) {
		if ((*t)->nout < (*t)->maxout
		&& t != d->htgt
		&& (*t)->ifp->nd) {
			rf = NULL;
			f = (*t)->frames;
			e = f + (*t)->nframes;
			for (; f < e; f++) {
				if (f->tag != FREETAG)
					continue;
				skb = f->skb;
				if (!skb
				&& !(f->skb = skb = new_skb(ETH_ZLEN)))
					continue;
				if (atomic_read(&skb_shinfo(skb)->dataref)
					!= 1) {
					if (!rf)
						rf = f;
					continue;
				}
gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
				skb_trim(skb, 0);
				d->tgt = t;
				ifrotate(*t);
				return f;
			}
			/* Work can be done, but the network layer is
			   holding our precious packets.  Try to grab
			   one from the pool. */
			f = rf;
			if (f == NULL) {	/* more paranoia */
				printk(KERN_ERR
					"aoe: freeframe: %s.\n",
					"unexpected null rf");
				d->flags |= DEVFL_KICKME;
				return NULL;
			}
			skb = skb_pool_get(d);
			if (skb) {
				skb_pool_put(d, f->skb);
				f->skb = skb;
				goto gotone;
			}
			(*t)->dataref++;
			if ((*t)->nout == 0)
				d->flags |= DEVFL_KICKME;
		}
		if (t == d->tgt)	/* we've looped and found nada */
			break;
		t++;
		if (t >= &d->targets[NTARGETS] || !*t)
			t = d->targets;
	}
	return NULL;
}

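/*
 * Summary of the above (added description, not extra logic):
 * freeframe() resumes the round-robin scan at the target after d->tgt,
 * skips any target that already has maxout frames outstanding or is
 * being helped via d->htgt, and on success records the winning target
 * in d->tgt and rotates its interface, balancing load across both
 * targets and their network interfaces.
 */
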
static int
aoecmd_ata_rw(struct aoedev *d)
{
	struct frame *f;
	struct aoe_hdr *h;
	struct aoe_atahdr *ah;
	struct buf *buf;
	struct bio_vec *bv;
	struct aoetgt *t;
	struct sk_buff *skb;
	ulong bcnt;
	char writebit, extbit;

	writebit = 0x10;
	extbit = 0x4;

	f = freeframe(d);
	if (f == NULL)
		return 0;
	t = *d->tgt;
	buf = d->inprocess;
	bv = buf->bv;
	bcnt = t->ifp->maxbcnt;
	if (bcnt == 0)
		bcnt = DEFAULTBCNT;
	if (bcnt > buf->bv_resid)
		bcnt = buf->bv_resid;
	/* initialize the headers & frame */
	skb = f->skb;
	h = (struct aoe_hdr *) skb_mac_header(skb);
	ah = (struct aoe_atahdr *) (h+1);
	skb_put(skb, sizeof *h + sizeof *ah);
	memset(h, 0, skb->len);
	f->tag = aoehdr_atainit(d, t, h);
	t->nout++;
	f->waited = 0;
	f->buf = buf;
	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
	f->bcnt = bcnt;
	f->lba = buf->sector;

	/* set up ata header */
	ah->scnt = bcnt >> 9;
	put_lba(ah, buf->sector);
	if (d->flags & DEVFL_EXT) {
		ah->aflags |= AOEAFL_EXT;
	} else {
		extbit = 0;
		ah->lba3 &= 0x0f;
		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
	}
	if (bio_data_dir(buf->bio) == WRITE) {
		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
		ah->aflags |= AOEAFL_WRITE;
		skb->len += bcnt;
		skb->data_len = bcnt;
		t->wpkts++;
	} else {
		t->rpkts++;
		writebit = 0;
	}

	ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;

	/* mark all tracking fields and load out */
	buf->nframesout += 1;
	buf->bv_off += bcnt;
	buf->bv_resid -= bcnt;
	buf->resid -= bcnt;
	buf->sector += bcnt >> 9;
	if (buf->resid == 0) {
		d->inprocess = NULL;
	} else if (buf->bv_resid == 0) {
		buf->bv = ++bv;
		buf->bv_resid = bv->bv_len;
		WARN_ON(buf->bv_resid == 0);
		buf->bv_off = bv->bv_offset;
	}

	skb->dev = t->ifp->nd;
	skb = skb_clone(skb, GFP_ATOMIC);
	if (skb)
		__skb_queue_tail(&d->sendq, skb);
	return 1;
}

/* some callers cannot sleep, and they can call this function,
 * transmitting the packets later, when interrupts are on
 */
static void
aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
{
	struct aoe_hdr *h;
	struct aoe_cfghdr *ch;
	struct sk_buff *skb;
	struct net_device *ifp;

	read_lock(&dev_base_lock);
	for_each_netdev(&init_net, ifp) {
		dev_hold(ifp);
		if (!is_aoe_netif(ifp))
			goto cont;

		skb = new_skb(sizeof *h + sizeof *ch);
		if (skb == NULL) {
			printk(KERN_INFO "aoe: skb alloc failure\n");
			goto cont;
		}
		skb_put(skb, sizeof *h + sizeof *ch);
		skb->dev = ifp;
		__skb_queue_tail(queue, skb);
		h = (struct aoe_hdr *) skb_mac_header(skb);
		memset(h, 0, sizeof *h + sizeof *ch);

		memset(h->dst, 0xff, sizeof h->dst);
		memcpy(h->src, ifp->dev_addr, sizeof h->src);
		h->type = __constant_cpu_to_be16(ETH_P_AOE);
		h->verfl = AOE_HVER;
		h->major = cpu_to_be16(aoemajor);
		h->minor = aoeminor;
		h->cmd = AOECMD_CFG;

cont:
		dev_put(ifp);
	}
	read_unlock(&dev_base_lock);
}

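/*
 * A typical caller (cf. aoecmd_cfg below) builds the packets into a
 * private queue while holding whatever locks it needs, then transmits
 * once it is safe to do so:
 *
 *	struct sk_buff_head queue;
 *
 *	__skb_queue_head_init(&queue);
 *	aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
 *	aoenet_xmit(&queue);
 */
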
static void
resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
{
	struct sk_buff *skb;
	struct aoe_hdr *h;
	struct aoe_atahdr *ah;
	char buf[128];
	u32 n;

	ifrotate(t);
	n = newtag(t);
	skb = f->skb;
	h = (struct aoe_hdr *) skb_mac_header(skb);
	ah = (struct aoe_atahdr *) (h+1);

	snprintf(buf, sizeof buf,
		"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
		"retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
		h->src, h->dst, t->nout);
	aoechr_error(buf);

	f->tag = n;
	h->tag = cpu_to_be32(n);
	memcpy(h->dst, t->addr, sizeof h->dst);
	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);

	switch (ah->cmdstat) {
	default:
		break;
	case ATA_CMD_PIO_READ:
	case ATA_CMD_PIO_READ_EXT:
	case ATA_CMD_PIO_WRITE:
	case ATA_CMD_PIO_WRITE_EXT:
		put_lba(ah, f->lba);

		n = f->bcnt;
		if (n > DEFAULTBCNT)
			n = DEFAULTBCNT;
		ah->scnt = n >> 9;
		if (ah->aflags & AOEAFL_WRITE) {
			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
				offset_in_page(f->bufaddr), n);
			skb->len = sizeof *h + sizeof *ah + n;
			skb->data_len = n;
		}
	}
	skb->dev = t->ifp->nd;
	skb = skb_clone(skb, GFP_ATOMIC);
	if (skb == NULL)
		return;
	__skb_queue_tail(&d->sendq, skb);
}

static int
tsince(int tag)
{
	int n;

	n = jiffies & 0xffff;
	n -= tag & 0xffff;
	if (n < 0)
		n += 1<<16;
	return n;
}

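/*
 * Worked example (illustrative): a frame sent at tick 0xfff0 and
 * checked at tick 0x0010 gives n = 0x0010 - 0xfff0 = -0xffe0; the
 * wraparound correction adds 1<<16, yielding 0x20 ticks elapsed.
 */
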
static struct aoeif *
getif(struct aoetgt *t, struct net_device *nd)
{
	struct aoeif *p, *e;

	p = t->ifs;
	e = p + NAOEIFS;
	for (; p < e; p++)
		if (p->nd == nd)
			return p;
	return NULL;
}

static struct aoeif *
addif(struct aoetgt *t, struct net_device *nd)
{
	struct aoeif *p;

	p = getif(t, NULL);
	if (!p)
		return NULL;
	p->nd = nd;
	p->maxbcnt = DEFAULTBCNT;
	p->lost = 0;
	p->lostjumbo = 0;
	return p;
}

static void
ejectif(struct aoetgt *t, struct aoeif *ifp)
{
	struct aoeif *e;
	ulong n;

	e = t->ifs + NAOEIFS - 1;
	n = (e - ifp) * sizeof *ifp;
	memmove(ifp, ifp+1, n);
	e->nd = NULL;
}

static int
sthtith(struct aoedev *d)
{
	struct frame *f, *e, *nf;
	struct sk_buff *skb;
	struct aoetgt *ht = *d->htgt;

	f = ht->frames;
	e = f + ht->nframes;
	for (; f < e; f++) {
		if (f->tag == FREETAG)
			continue;
		nf = freeframe(d);
		if (!nf)
			return 0;
		skb = nf->skb;
		*nf = *f;
		f->skb = skb;
		f->tag = FREETAG;
		nf->waited = 0;
		ht->nout--;
		(*d->tgt)->nout++;
		resend(d, *d->tgt, nf);
	}
	/* he's clean, he's useless.  take away his interfaces */
	memset(ht->ifs, 0, sizeof ht->ifs);
	d->htgt = NULL;
	return 1;
}

static inline unsigned char
ata_scnt(unsigned char *packet) {
	struct aoe_hdr *h;
	struct aoe_atahdr *ah;

	h = (struct aoe_hdr *) packet;
	ah = (struct aoe_atahdr *) (h+1);
	return ah->scnt;
}

static void
rexmit_timer(ulong vp)
{
	struct sk_buff_head queue;
	struct aoedev *d;
	struct aoetgt *t, **tt, **te;
	struct aoeif *ifp;
	struct frame *f, *e;
	register long timeout;
	ulong flags, n;

	d = (struct aoedev *) vp;

	/* timeout is always ~150% of the moving average */
	timeout = d->rttavg;
	timeout += timeout >> 1;

	spin_lock_irqsave(&d->lock, flags);

	if (d->flags & DEVFL_TKILL) {
		spin_unlock_irqrestore(&d->lock, flags);
		return;
	}
	tt = d->targets;
	te = tt + NTARGETS;
	for (; tt < te && *tt; tt++) {
		t = *tt;
		f = t->frames;
		e = f + t->nframes;
		for (; f < e; f++) {
			if (f->tag == FREETAG
			|| tsince(f->tag) < timeout)
				continue;
			n = f->waited += timeout;
			n /= HZ;
			if (n > aoe_deadsecs) {
				/* waited too long.  device failure. */
				aoedev_downdev(d);
				break;
			}

			if (n > HELPWAIT /* see if another target can help */
			&& (tt != d->targets || d->targets[1]))
				d->htgt = tt;

			if (t->nout == t->maxout) {
				if (t->maxout > 1)
					t->maxout--;
				t->lastwadj = jiffies;
			}

			ifp = getif(t, f->skb->dev);
			if (ifp && ++ifp->lost > (t->nframes << 1)
			&& (ifp != t->ifs || t->ifs[1].nd)) {
				ejectif(t, ifp);
				ifp = NULL;
			}

			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
			&& ifp->maxbcnt != DEFAULTBCNT) {
				printk(KERN_INFO
					"aoe: e%ld.%d: "
					"too many lost jumbo on "
					"%s:%pm - "
					"falling back to %d frames.\n",
					d->aoemajor, d->aoeminor,
					ifp->nd->name, t->addr,
					DEFAULTBCNT);
				ifp->maxbcnt = 0;
			}
			resend(d, t, f);
		}

		/* window check */
		if (t->nout == t->maxout
		&& t->maxout < t->nframes
		&& (jiffies - t->lastwadj)/HZ > 10) {
			t->maxout++;
			t->lastwadj = jiffies;
		}
	}

	if (!skb_queue_empty(&d->sendq)) {
		n = d->rttavg <<= 1;
		if (n > MAXTIMER)
			d->rttavg = MAXTIMER;
	}

	if (d->flags & DEVFL_KICKME || d->htgt) {
		d->flags &= ~DEVFL_KICKME;
		aoecmd_work(d);
	}

	__skb_queue_head_init(&queue);
	skb_queue_splice_init(&d->sendq, &queue);

	d->timer.expires = jiffies + TIMERTICK;
	add_timer(&d->timer);

	spin_unlock_irqrestore(&d->lock, flags);

	aoenet_xmit(&queue);
}

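/*
 * Worked example of the timeout above (illustrative numbers): with
 * d->rttavg at 40 ticks, timeout = 40 + (40 >> 1) = 60 ticks, i.e.
 * roughly 150% of the moving average, before a frame is retransmitted.
 */
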
/* enters with d->lock held */
void
aoecmd_work(struct aoedev *d)
{
	struct buf *buf;
loop:
	if (d->htgt && !sthtith(d))
		return;
	if (d->inprocess == NULL) {
		if (list_empty(&d->bufq))
			return;
		buf = container_of(d->bufq.next, struct buf, bufs);
		list_del(d->bufq.next);
		d->inprocess = buf;
	}
	if (aoecmd_ata_rw(d))
		goto loop;
}

/* this function performs work that has been deferred until sleeping is OK
 */
void
aoecmd_sleepwork(struct work_struct *work)
{
	struct aoedev *d = container_of(work, struct aoedev, work);

	if (d->flags & DEVFL_GDALLOC)
		aoeblk_gdalloc(d);

	if (d->flags & DEVFL_NEWSIZE) {
		struct block_device *bd;
		unsigned long flags;
		u64 ssize;

		ssize = get_capacity(d->gd);
		bd = bdget_disk(d->gd, 0);

		if (bd) {
			mutex_lock(&bd->bd_inode->i_mutex);
			i_size_write(bd->bd_inode, (loff_t)ssize<<9);
			mutex_unlock(&bd->bd_inode->i_mutex);
			bdput(bd);
		}
		spin_lock_irqsave(&d->lock, flags);
		d->flags |= DEVFL_UP;
		d->flags &= ~DEVFL_NEWSIZE;
		spin_unlock_irqrestore(&d->lock, flags);
	}
}

static void
ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
{
	u64 ssize;
	u16 n;

	/* word 83: command set supported */
	n = get_unaligned_le16(&id[83 << 1]);

	/* word 86: command set/feature enabled */
	n |= get_unaligned_le16(&id[86 << 1]);

	if (n & (1<<10)) {	/* bit 10: LBA 48 */
		d->flags |= DEVFL_EXT;

		/* word 100: number lba48 sectors */
		ssize = get_unaligned_le64(&id[100 << 1]);

		/* set as in ide-disk.c:init_idedisk_capacity */
		d->geo.cylinders = ssize;
		d->geo.cylinders /= (255 * 63);
		d->geo.heads = 255;
		d->geo.sectors = 63;
	} else {
		d->flags &= ~DEVFL_EXT;

		/* number lba28 sectors */
		ssize = get_unaligned_le32(&id[60 << 1]);

		/* NOTE: obsolete in ATA 6 */
		d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
		d->geo.heads = get_unaligned_le16(&id[55 << 1]);
		d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
	}

	if (d->ssize != ssize)
		printk(KERN_INFO
			"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
			t->addr,
			d->aoemajor, d->aoeminor,
			d->fw_ver, (long long)ssize);
	d->ssize = ssize;
	d->geo.start = 0;
	if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
		return;
	if (d->gd != NULL) {
		set_capacity(d->gd, ssize);
		d->flags |= DEVFL_NEWSIZE;
	} else
		d->flags |= DEVFL_GDALLOC;
	schedule_work(&d->work);
}

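/*
 * Worked example (illustrative): a 250 GB target reporting
 * 488,397,168 LBA48 sectors gets the fake geometry
 * cylinders = 488397168 / (255 * 63) = 30401, heads = 255,
 * sectors = 63, mirroring ide-disk's init_idedisk_capacity.
 */
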
static void
calc_rttavg(struct aoedev *d, int rtt)
{
	register long n;

	n = rtt;
	if (n < 0) {
		n = -rtt;
		if (n < MINTIMER)
			n = MINTIMER;
		else if (n > MAXTIMER)
			n = MAXTIMER;
		d->mintimer += (n - d->mintimer) >> 1;
	} else if (n < d->mintimer)
		n = d->mintimer;
	else if (n > MAXTIMER)
		n = MAXTIMER;

	/* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
	n -= d->rttavg;
	d->rttavg += n >> 2;
}

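/*
 * Numeric example of the g = .25 update (illustrative): with
 * d->rttavg == 100 and a new sample rtt == 140, n ends up as 40 and
 * d->rttavg moves a quarter of the difference, to 110.
 */
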
static struct aoetgt *
gettgt(struct aoedev *d, char *addr)
{
	struct aoetgt **t, **e;

	t = d->targets;
	e = t + NTARGETS;
	for (; t < e && *t; t++)
		if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
			return *t;
	return NULL;
}

static void
diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
{
	unsigned long n_sect = bio->bi_size >> 9;
	const int rw = bio_data_dir(bio);
	struct hd_struct *part;
	int cpu;

	cpu = part_stat_lock();
	part = disk_map_sector_rcu(disk, sector);

	part_stat_inc(cpu, part, ios[rw]);
	part_stat_add(cpu, part, ticks[rw], duration);
	part_stat_add(cpu, part, sectors[rw], n_sect);
	part_stat_add(cpu, part, io_ticks, duration);

	part_stat_unlock();
}

void
aoecmd_ata_rsp(struct sk_buff *skb)
{
	struct sk_buff_head queue;
	struct aoedev *d;
	struct aoe_hdr *hin, *hout;
	struct aoe_atahdr *ahin, *ahout;
	struct frame *f;
	struct buf *buf;
	struct aoetgt *t;
	struct aoeif *ifp;
	register long n;
	ulong flags;
	char ebuf[128];
	u16 aoemajor;

	hin = (struct aoe_hdr *) skb_mac_header(skb);
	aoemajor = get_unaligned_be16(&hin->major);
	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
	if (d == NULL) {
		snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
			"for unknown device %d.%d\n",
			aoemajor, hin->minor);
		aoechr_error(ebuf);
		return;
	}

	spin_lock_irqsave(&d->lock, flags);

	n = get_unaligned_be32(&hin->tag);
	t = gettgt(d, hin->src);
	if (t == NULL) {
		printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
			d->aoemajor, d->aoeminor, hin->src);
		spin_unlock_irqrestore(&d->lock, flags);
		return;
	}
	f = getframe(t, n);
	if (f == NULL) {
		calc_rttavg(d, -tsince(n));
		spin_unlock_irqrestore(&d->lock, flags);
		snprintf(ebuf, sizeof ebuf,
			"%15s e%d.%d    tag=%08x@%08lx\n",
			"unexpected rsp",
			get_unaligned_be16(&hin->major),
			hin->minor,
			get_unaligned_be32(&hin->tag),
			jiffies);
		aoechr_error(ebuf);
		return;
	}

	calc_rttavg(d, tsince(f->tag));

	ahin = (struct aoe_atahdr *) (hin+1);
	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
	ahout = (struct aoe_atahdr *) (hout+1);
	buf = f->buf;

	if (ahin->cmdstat & 0xa9) {	/* these bits cleared on success */
		printk(KERN_ERR
			"aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
			ahout->cmdstat, ahin->cmdstat,
			d->aoemajor, d->aoeminor);
		if (buf)
			buf->flags |= BUFFL_FAIL;
	} else {
		if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
			d->htgt = NULL;
		n = ahout->scnt << 9;
		switch (ahout->cmdstat) {
		case ATA_CMD_PIO_READ:
		case ATA_CMD_PIO_READ_EXT:
			if (skb->len - sizeof *hin - sizeof *ahin < n) {
				printk(KERN_ERR
					"aoe: %s.  skb->len=%d need=%ld\n",
					"runt data size in read", skb->len, n);
				/* fail frame f?  just returning will rexmit. */
				spin_unlock_irqrestore(&d->lock, flags);
				return;
			}
			memcpy(f->bufaddr, ahin+1, n);
		case ATA_CMD_PIO_WRITE:
		case ATA_CMD_PIO_WRITE_EXT:
			ifp = getif(t, skb->dev);
			if (ifp) {
				ifp->lost = 0;
				if (n > DEFAULTBCNT)
					ifp->lostjumbo = 0;
			}
			if (f->bcnt -= n) {
				f->lba += n >> 9;
				f->bufaddr += n;
				resend(d, t, f);
				goto xmit;
			}
			break;
		case ATA_CMD_ID_ATA:
			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
				printk(KERN_INFO
					"aoe: runt data size in ataid.  skb->len=%d\n",
					skb->len);
				spin_unlock_irqrestore(&d->lock, flags);
				return;
			}
			ataid_complete(d, t, (char *) (ahin+1));
			break;
		default:
			printk(KERN_INFO
				"aoe: unrecognized ata command %2.2Xh for %d.%d\n",
				ahout->cmdstat,
				get_unaligned_be16(&hin->major),
				hin->minor);
		}
	}

	if (buf && --buf->nframesout == 0 && buf->resid == 0) {
		diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
		n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
		bio_endio(buf->bio, n);
		mempool_free(buf, d->bufpool);
	}

	f->buf = NULL;
	f->tag = FREETAG;
	t->nout--;

	aoecmd_work(d);
xmit:
	__skb_queue_head_init(&queue);
	skb_queue_splice_init(&d->sendq, &queue);

	spin_unlock_irqrestore(&d->lock, flags);
	aoenet_xmit(&queue);
}

void
aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
{
	struct sk_buff_head queue;

	__skb_queue_head_init(&queue);
	aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
	aoenet_xmit(&queue);
}

struct sk_buff *
aoecmd_ata_id(struct aoedev *d)
{
	struct aoe_hdr *h;
	struct aoe_atahdr *ah;
	struct frame *f;
	struct sk_buff *skb;
	struct aoetgt *t;

	f = freeframe(d);
	if (f == NULL)
		return NULL;

	t = *d->tgt;

	/* initialize the headers & frame */
	skb = f->skb;
	h = (struct aoe_hdr *) skb_mac_header(skb);
	ah = (struct aoe_atahdr *) (h+1);
	skb_put(skb, sizeof *h + sizeof *ah);
	memset(h, 0, skb->len);
	f->tag = aoehdr_atainit(d, t, h);
	t->nout++;
	f->waited = 0;

	/* set up ata header */
	ah->scnt = 1;
	ah->cmdstat = ATA_CMD_ID_ATA;
	ah->lba3 = 0xa0;

	skb->dev = t->ifp->nd;

	d->rttavg = MAXTIMER;
	d->timer.function = rexmit_timer;

	return skb_clone(skb, GFP_ATOMIC);
}

static struct aoetgt *
addtgt(struct aoedev *d, char *addr, ulong nframes)
{
	struct aoetgt *t, **tt, **te;
	struct frame *f, *e;

	tt = d->targets;
	te = tt + NTARGETS;
	for (; tt < te && *tt; tt++)
		;

	if (tt == te) {
		printk(KERN_INFO
			"aoe: device addtgt failure; too many targets\n");
		return NULL;
	}
	t = kcalloc(1, sizeof *t, GFP_ATOMIC);
	f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
	if (!t || !f) {
		kfree(f);
		kfree(t);
		printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
		return NULL;
	}

	t->nframes = nframes;
	t->frames = f;
	e = f + nframes;
	for (; f < e; f++)
		f->tag = FREETAG;
	memcpy(t->addr, addr, sizeof t->addr);
	t->ifp = t->ifs;
	t->maxout = t->nframes;
	return *tt = t;
}

void
aoecmd_cfg_rsp(struct sk_buff *skb)
{
	struct aoedev *d;
	struct aoe_hdr *h;
	struct aoe_cfghdr *ch;
	struct aoetgt *t;
	struct aoeif *ifp;
	ulong flags, sysminor, aoemajor;
	struct sk_buff *sl;
	u16 n;

	h = (struct aoe_hdr *) skb_mac_header(skb);
	ch = (struct aoe_cfghdr *) (h+1);

	/*
	 * Enough people have their dip switches set backwards to
	 * warrant a loud message for this special case.
	 */
	aoemajor = get_unaligned_be16(&h->major);
	if (aoemajor == 0xfff) {
		printk(KERN_ERR "aoe: Warning: shelf address is all ones.  "
			"Check shelf dip switches.\n");
		return;
	}

	sysminor = SYSMINOR(aoemajor, h->minor);
	if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
		printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
			aoemajor, (int) h->minor);
		return;
	}

	n = be16_to_cpu(ch->bufcnt);
	if (n > aoe_maxout)	/* keep it reasonable */
		n = aoe_maxout;

	d = aoedev_by_sysminor_m(sysminor);
	if (d == NULL) {
		printk(KERN_INFO "aoe: device sysminor_m failure\n");
		return;
	}

	spin_lock_irqsave(&d->lock, flags);

	t = gettgt(d, h->src);
	if (!t) {
		t = addtgt(d, h->src, n);
		if (!t) {
			spin_unlock_irqrestore(&d->lock, flags);
			return;
		}
	}
	ifp = getif(t, skb->dev);
	if (!ifp) {
		ifp = addif(t, skb->dev);
		if (!ifp) {
			printk(KERN_INFO
				"aoe: device addif failure; "
				"too many interfaces?\n");
			spin_unlock_irqrestore(&d->lock, flags);
			return;
		}
	}
	if (ifp->maxbcnt) {
		n = ifp->nd->mtu;
		n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
		n /= 512;
		if (n > ch->scnt)
			n = ch->scnt;
		n = n ? n * 512 : DEFAULTBCNT;
		if (n != ifp->maxbcnt) {
			printk(KERN_INFO
				"aoe: e%ld.%d: setting %d%s%s:%pm\n",
				d->aoemajor, d->aoeminor, n,
				" byte data frames on ", ifp->nd->name,
				t->addr);
			ifp->maxbcnt = n;
		}
	}

	/* don't change users' perspective */
	if (d->nopen) {
		spin_unlock_irqrestore(&d->lock, flags);
		return;
	}
	d->fw_ver = be16_to_cpu(ch->fwver);

	sl = aoecmd_ata_id(d);

	spin_unlock_irqrestore(&d->lock, flags);

	if (sl) {
		struct sk_buff_head queue;
		__skb_queue_head_init(&queue);
		__skb_queue_tail(&queue, sl);
		aoenet_xmit(&queue);
	}
}

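/*
 * Worked example for the maxbcnt calculation above (illustrative):
 * with an MTU of 9000 and sizeof(struct aoe_hdr) +
 * sizeof(struct aoe_atahdr) == 36 on the wire, 8964 bytes remain,
 * or 17 whole 512-byte sectors, so maxbcnt becomes 8704 unless the
 * target's ch->scnt advertises fewer.
 */
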
void
aoecmd_cleanslate(struct aoedev *d)
{
	struct aoetgt **t, **te;
	struct aoeif *p, *e;

	d->mintimer = MINTIMER;

	t = d->targets;
	te = t + NTARGETS;
	for (; t < te && *t; t++) {
		(*t)->maxout = (*t)->nframes;
		p = (*t)->ifs;
		e = p + NAOEIFS;
		for (; p < e; p++) {
			p->lostjumbo = 0;
			p->lost = 0;
			p->maxbcnt = DEFAULTBCNT;
		}
	}
}