drivers/infiniband/hw/ipath/ipath_driver.c
1 /*
2  * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/spinlock.h>
34 #include <linux/idr.h>
35 #include <linux/pci.h>
36 #include <linux/delay.h>
37 #include <linux/netdevice.h>
38 #include <linux/vmalloc.h>
39
40 #include "ipath_kernel.h"
41 #include "ips_common.h"
42 #include "ipath_layer.h"
43
44 static void ipath_update_pio_bufs(struct ipath_devdata *);
45
46 const char *ipath_get_unit_name(int unit)
47 {
48         static char iname[16];
49         snprintf(iname, sizeof iname, "infinipath%u", unit);
50         return iname;
51 }
52
53 EXPORT_SYMBOL_GPL(ipath_get_unit_name);
54
55 #define DRIVER_LOAD_MSG "PathScale " IPATH_DRV_NAME " loaded: "
56 #define PFX IPATH_DRV_NAME ": "
57
58 /*
59  * The size has to be longer than this string, so we can append
60  * board/chip information to it in the init code.
61  */
62 const char ipath_core_version[] = IPATH_IDSTR "\n";
63
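/*
 * unit_table maps unit numbers to their ipath_devdata; entries are added,
 * removed and looked up under ipath_devs_lock.
 */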
64 static struct idr unit_table;
65 DEFINE_SPINLOCK(ipath_devs_lock);
66 LIST_HEAD(ipath_dev_list);
67
68 wait_queue_head_t ipath_sma_state_wait;
69
70 unsigned ipath_debug = __IPATH_INFO;
71
72 module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
73 MODULE_PARM_DESC(debug, "mask for debug prints");
74 EXPORT_SYMBOL_GPL(ipath_debug);
75
76 MODULE_LICENSE("GPL");
77 MODULE_AUTHOR("PathScale <support@pathscale.com>");
78 MODULE_DESCRIPTION("PathScale InfiniPath driver");
79
80 const char *ipath_ibcstatus_str[] = {
81         "Disabled",
82         "LinkUp",
83         "PollActive",
84         "PollQuiet",
85         "SleepDelay",
86         "SleepQuiet",
87         "LState6",              /* unused */
88         "LState7",              /* unused */
89         "CfgDebounce",
90         "CfgRcvfCfg",
91         "CfgWaitRmt",
92         "CfgIdle",
93         "RecovRetrain",
94         "LState0xD",            /* unused */
95         "RecovWaitRmt",
96         "RecovIdle",
97 };
98
99 /*
100  * These variables are initialized in the chip-specific files
101  * but are defined here.
102  */
103 u16 ipath_gpio_sda_num, ipath_gpio_scl_num;
104 u64 ipath_gpio_sda, ipath_gpio_scl;
105 u64 infinipath_i_bitsextant;
106 ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
107 u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
108
109 static void __devexit ipath_remove_one(struct pci_dev *);
110 static int __devinit ipath_init_one(struct pci_dev *,
111                                     const struct pci_device_id *);
112
113 /* Only needed for registration, nothing else needs this info */
114 #define PCI_VENDOR_ID_PATHSCALE 0x1fc1
115 #define PCI_DEVICE_ID_INFINIPATH_HT 0xd
116 #define PCI_DEVICE_ID_INFINIPATH_PE800 0x10
117
118 static const struct pci_device_id ipath_pci_tbl[] = {
119         { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
120         { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_PE800) },
121         { 0, }
122 };
123
124 MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
125
126 static struct pci_driver ipath_driver = {
127         .name = IPATH_DRV_NAME,
128         .probe = ipath_init_one,
129         .remove = __devexit_p(ipath_remove_one),
130         .id_table = ipath_pci_tbl,
131 };
132
133 /*
134  * This is where port 0's rcvhdrtail register is written back; we also
135  * want nothing else sharing the cache line, so make it a cache line
136  * in size.  Used for all units.
137  */
138 volatile __le64 *ipath_port0_rcvhdrtail;
139 dma_addr_t ipath_port0_rcvhdrtail_dma;
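/*
 * Reference count for the shared port 0 rcvhdrtail buffer (one reference
 * per probed device); protected by ipath_mutex.
 */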
140 static int port0_rcvhdrtail_refs;
141
142 static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
143                              u32 *bar0, u32 *bar1)
144 {
145         int ret;
146
147         ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
148         if (ret)
149                 ipath_dev_err(dd, "failed to read bar0 before enable: "
150                               "error %d\n", -ret);
151
152         ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
153         if (ret)
154                 ipath_dev_err(dd, "failed to read bar1 before enable: "
155                               "error %d\n", -ret);
156
157         ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
158 }
159
160 static void ipath_free_devdata(struct pci_dev *pdev,
161                                struct ipath_devdata *dd)
162 {
163         unsigned long flags;
164
165         pci_set_drvdata(pdev, NULL);
166
167         if (dd->ipath_unit != -1) {
168                 spin_lock_irqsave(&ipath_devs_lock, flags);
169                 idr_remove(&unit_table, dd->ipath_unit);
170                 list_del(&dd->ipath_list);
171                 spin_unlock_irqrestore(&ipath_devs_lock, flags);
172         }
173         dma_free_coherent(&pdev->dev, sizeof(*dd), dd, dd->ipath_dma_addr);
174 }
175
176 static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
177 {
178         unsigned long flags;
179         struct ipath_devdata *dd;
180         dma_addr_t dma_addr;
181         int ret;
182
183         if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
184                 dd = ERR_PTR(-ENOMEM);
185                 goto bail;
186         }
187
188         dd = dma_alloc_coherent(&pdev->dev, sizeof(*dd), &dma_addr,
189                                 GFP_KERNEL);
190
191         if (!dd) {
192                 dd = ERR_PTR(-ENOMEM);
193                 goto bail;
194         }
195
196         dd->ipath_dma_addr = dma_addr;
197         dd->ipath_unit = -1;
198
199         spin_lock_irqsave(&ipath_devs_lock, flags);
200
201         ret = idr_get_new(&unit_table, dd, &dd->ipath_unit);
202         if (ret < 0) {
203                 printk(KERN_ERR IPATH_DRV_NAME
204                        ": Could not allocate unit ID: error %d\n", -ret);
205                 ipath_free_devdata(pdev, dd);
206                 dd = ERR_PTR(ret);
207                 goto bail_unlock;
208         }
209
210         dd->pcidev = pdev;
211         pci_set_drvdata(pdev, dd);
212
213         list_add(&dd->ipath_list, &ipath_dev_list);
214
215 bail_unlock:
216         spin_unlock_irqrestore(&ipath_devs_lock, flags);
217
218 bail:
219         return dd;
220 }
221
222 static inline struct ipath_devdata *__ipath_lookup(int unit)
223 {
224         return idr_find(&unit_table, unit);
225 }
226
227 struct ipath_devdata *ipath_lookup(int unit)
228 {
229         struct ipath_devdata *dd;
230         unsigned long flags;
231
232         spin_lock_irqsave(&ipath_devs_lock, flags);
233         dd = __ipath_lookup(unit);
234         spin_unlock_irqrestore(&ipath_devs_lock, flags);
235
236         return dd;
237 }
238
239 int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp)
240 {
241         int nunits, npresent, nup;
242         struct ipath_devdata *dd;
243         unsigned long flags;
244         u32 maxports;
245
246         nunits = npresent = nup = maxports = 0;
247
248         spin_lock_irqsave(&ipath_devs_lock, flags);
249
250         list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
251                 nunits++;
252                 if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
253                         npresent++;
254                 if (dd->ipath_lid &&
255                     !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
256                                          | IPATH_LINKUNK)))
257                         nup++;
258                 if (dd->ipath_cfgports > maxports)
259                         maxports = dd->ipath_cfgports;
260         }
261
262         spin_unlock_irqrestore(&ipath_devs_lock, flags);
263
264         if (npresentp)
265                 *npresentp = npresent;
266         if (nupp)
267                 *nupp = nup;
268         if (maxportsp)
269                 *maxportsp = maxports;
270
271         return nunits;
272 }
273
274 static int init_port0_rcvhdrtail(struct pci_dev *pdev)
275 {
276         int ret;
277
278         mutex_lock(&ipath_mutex);
279
280         if (!ipath_port0_rcvhdrtail) {
281                 ipath_port0_rcvhdrtail =
282                         dma_alloc_coherent(&pdev->dev,
283                                            IPATH_PORT0_RCVHDRTAIL_SIZE,
284                                            &ipath_port0_rcvhdrtail_dma,
285                                            GFP_KERNEL);
286
287                 if (!ipath_port0_rcvhdrtail) {
288                         ret = -ENOMEM;
289                         goto bail;
290                 }
291         }
292         port0_rcvhdrtail_refs++;
293         ret = 0;
294
295 bail:
296         mutex_unlock(&ipath_mutex);
297
298         return ret;
299 }
300
301 static void cleanup_port0_rcvhdrtail(struct pci_dev *pdev)
302 {
303         mutex_lock(&ipath_mutex);
304
305         if (!--port0_rcvhdrtail_refs) {
306                 dma_free_coherent(&pdev->dev, IPATH_PORT0_RCVHDRTAIL_SIZE,
307                                   (void *) ipath_port0_rcvhdrtail,
308                                   ipath_port0_rcvhdrtail_dma);
309                 ipath_port0_rcvhdrtail = NULL;
310         }
311
312         mutex_unlock(&ipath_mutex);
313 }
314
315 /*
316  * These next two routines are placeholders in case we don't have per-arch
317  * code for controlling write combining.  If explicit control of write
318  * combining is not available, performance will probably be awful.
319  */
320
321 int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
322 {
323         return -EOPNOTSUPP;
324 }
325
326 void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
327 {
328 }
329
330 static int __devinit ipath_init_one(struct pci_dev *pdev,
331                                     const struct pci_device_id *ent)
332 {
333         int ret, len, j;
334         struct ipath_devdata *dd;
335         unsigned long long addr;
336         u32 bar0 = 0, bar1 = 0;
337         u8 rev;
338
339         ret = init_port0_rcvhdrtail(pdev);
340         if (ret < 0) {
341                 printk(KERN_ERR IPATH_DRV_NAME
342                        ": Could not allocate port0_rcvhdrtail: error %d\n",
343                        -ret);
344                 goto bail;
345         }
346
347         dd = ipath_alloc_devdata(pdev);
348         if (IS_ERR(dd)) {
349                 ret = PTR_ERR(dd);
350                 printk(KERN_ERR IPATH_DRV_NAME
351                        ": Could not allocate devdata: error %d\n", -ret);
352                 goto bail_rcvhdrtail;
353         }
354
355         ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
356
357         read_bars(dd, pdev, &bar0, &bar1);
358
359         ret = pci_enable_device(pdev);
360         if (ret) {
361                 /* This can happen iff:
362                  *
363                  * We did a chip reset, and then failed to reprogram the
364                  * BAR, or the chip reset due to an internal error.  We then
365                  * unloaded the driver and reloaded it.
366                  *
367                  * Both reset cases set the BAR back to initial state.  For
368                  * the latter case, the AER sticky error bit at offset 0x718
369                  * should be set, but the Linux kernel doesn't yet know
370                  * about that, it appears.  If the original BAR was retained
371                  * in the kernel data structures, this may be OK.
372                  */
373                 ipath_dev_err(dd, "enable unit %d failed: error %d\n",
374                               dd->ipath_unit, -ret);
375                 goto bail_devdata;
376         }
377         addr = pci_resource_start(pdev, 0);
378         len = pci_resource_len(pdev, 0);
379         ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %x, vend %x/%x "
380                    "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
381                    ent->device, ent->driver_data);
382
383         read_bars(dd, pdev, &bar0, &bar1);
384
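        /*
         * The low 4 bits of BAR0 are attribute/type bits, so mask them off
         * when checking whether the BAR has been reset to zero.
         */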
385         if (!bar1 && !(bar0 & ~0xf)) {
386                 if (addr) {
387                         dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
388                                  "rewriting as %llx\n", addr);
389                         ret = pci_write_config_dword(
390                                 pdev, PCI_BASE_ADDRESS_0, addr);
391                         if (ret) {
392                                 ipath_dev_err(dd, "rewrite of BAR0 "
393                                               "failed: err %d\n", -ret);
394                                 goto bail_disable;
395                         }
396                         ret = pci_write_config_dword(
397                                 pdev, PCI_BASE_ADDRESS_1, addr >> 32);
398                         if (ret) {
399                                 ipath_dev_err(dd, "rewrite of BAR1 "
400                                               "failed: err %d\n", -ret);
401                                 goto bail_disable;
402                         }
403                 } else {
404                         ipath_dev_err(dd, "BAR is 0 (probable RESET), "
405                                       "not usable until reboot\n");
406                         ret = -ENODEV;
407                         goto bail_disable;
408                 }
409         }
410
411         ret = pci_request_regions(pdev, IPATH_DRV_NAME);
412         if (ret) {
413                 dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
414                          "err %d\n", dd->ipath_unit, -ret);
415                 goto bail_disable;
416         }
417
418         ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
419         if (ret) {
420                 /*
421                  * If the 64 bit setup fails, try 32 bit.  Some systems
422                  * do not set up 64 bit maps when 2GB or less memory is
423                  * installed.
424                  */
425                 ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
426                 if (ret) {
427                         dev_info(&pdev->dev, "pci_set_dma_mask unit %u "
428                                  "fails: %d\n", dd->ipath_unit, ret);
429                         goto bail_regions;
430                 }
431                 else
432                         ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
433         }
434
435         pci_set_master(pdev);
436
437         /*
438          * Save BARs to rewrite after device reset.  Save all 64 bits of
439          * BAR, just in case.
440          */
441         dd->ipath_pcibar0 = addr;
442         dd->ipath_pcibar1 = addr >> 32;
443         dd->ipath_deviceid = ent->device;       /* save for later use */
444         dd->ipath_vendorid = ent->vendor;
445
446         /* setup the chip-specific functions, as early as possible. */
447         switch (ent->device) {
448         case PCI_DEVICE_ID_INFINIPATH_HT:
449                 ipath_init_ht400_funcs(dd);
450                 break;
451         case PCI_DEVICE_ID_INFINIPATH_PE800:
452                 ipath_init_pe800_funcs(dd);
453                 break;
454         default:
455                 ipath_dev_err(dd, "Found unknown PathScale deviceid 0x%x, "
456                               "failing\n", ent->device);
457                 return -ENODEV;
458         }
459
460         for (j = 0; j < 6; j++) {
461                 if (!pdev->resource[j].start)
462                         continue;
463                 ipath_cdbg(VERBOSE, "BAR %d start %lx, end %lx, len %lx\n",
464                            j, pdev->resource[j].start,
465                            pdev->resource[j].end,
466                            pci_resource_len(pdev, j));
467         }
468
469         if (!addr) {
470                 ipath_dev_err(dd, "No valid address in BAR 0!\n");
471                 ret = -ENODEV;
472                 goto bail_regions;
473         }
474
475         dd->ipath_deviceid = ent->device;       /* save for later use */
476         dd->ipath_vendorid = ent->vendor;
477
478         ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev);
479         if (ret) {
480                 ipath_dev_err(dd, "Failed to read PCI revision ID unit "
481                               "%u: err %d\n", dd->ipath_unit, -ret);
482                 goto bail_regions;      /* shouldn't ever happen */
483         }
484         dd->ipath_pcirev = rev;
485
486         dd->ipath_kregbase = ioremap_nocache(addr, len);
487
488         if (!dd->ipath_kregbase) {
489                 ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
490                           addr);
491                 ret = -ENOMEM;
492                 goto bail_iounmap;
493         }
494         dd->ipath_kregend = (u64 __iomem *)
495                 ((void __iomem *)dd->ipath_kregbase + len);
496         dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
497         /* for user mmap */
498         dd->ipath_kregvirt = (u64 __iomem *) phys_to_virt(addr);
499         ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p "
500                    "kregvirt %p\n", addr, dd->ipath_kregbase,
501                    dd->ipath_kregvirt);
502
503         /*
504          * clear ipath_flags here instead of in ipath_init_chip as it is set
505          * by ipath_setup_htconfig.
506          */
507         dd->ipath_flags = 0;
508
509         if (dd->ipath_f_bus(dd, pdev))
510                 ipath_dev_err(dd, "Failed to setup config space; "
511                               "continuing anyway\n");
512
513         /*
514          * Set up our interrupt handler; SA_SHIRQ is probably not needed,
515          * since MSI interrupts shouldn't be shared, but it won't hurt for
516          * now.  Check for an irq of 0 only after we return from the
517          * chip-specific bus setup, since that setup can change the irq.
518          */
519         if (!pdev->irq)
520                 ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
521                               "work\n");
522         else {
523                 ret = request_irq(pdev->irq, ipath_intr, SA_SHIRQ,
524                                   IPATH_DRV_NAME, dd);
525                 if (ret) {
526                         ipath_dev_err(dd, "Couldn't setup irq handler, "
527                                       "irq=%u: %d\n", pdev->irq, ret);
528                         goto bail_iounmap;
529                 }
530         }
531
532         ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
533         if (ret)
534                 goto bail_iounmap;
535
536         ret = ipath_enable_wc(dd);
537
538         if (ret) {
539                 ipath_dev_err(dd, "Write combining not enabled "
540                               "(err %d): performance may be poor\n",
541                               -ret);
542                 ret = 0;
543         }
544
545         ipath_device_create_group(&pdev->dev, dd);
546         ipathfs_add_device(dd);
547         ipath_user_add(dd);
548         ipath_layer_add(dd);
549
550         goto bail;
551
552 bail_iounmap:
553         iounmap((volatile void __iomem *) dd->ipath_kregbase);
554
555 bail_regions:
556         pci_release_regions(pdev);
557
558 bail_disable:
559         pci_disable_device(pdev);
560
561 bail_devdata:
562         ipath_free_devdata(pdev, dd);
563
564 bail_rcvhdrtail:
565         cleanup_port0_rcvhdrtail(pdev);
566
567 bail:
568         return ret;
569 }
570
571 static void __devexit ipath_remove_one(struct pci_dev *pdev)
572 {
573         struct ipath_devdata *dd;
574
575         ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev);
576         if (!pdev)
577                 return;
578
579         dd = pci_get_drvdata(pdev);
580         ipath_layer_del(dd);
581         ipath_user_del(dd);
582         ipathfs_remove_device(dd);
583         ipath_device_remove_group(&pdev->dev, dd);
584         ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
585                    "unit %u\n", dd, (u32) dd->ipath_unit);
586         if (dd->ipath_kregbase) {
587                 ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n",
588                            dd->ipath_kregbase);
589                 iounmap((volatile void __iomem *) dd->ipath_kregbase);
590                 dd->ipath_kregbase = NULL;
591         }
592         pci_release_regions(pdev);
593         ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
594         pci_disable_device(pdev);
595
596         ipath_free_devdata(pdev, dd);
597         cleanup_port0_rcvhdrtail(pdev);
598 }
599
600 /* general driver use */
601 DEFINE_MUTEX(ipath_mutex);
602
603 static DEFINE_SPINLOCK(ipath_pioavail_lock);
604
605 /**
606  * ipath_disarm_piobufs - cancel a range of PIO buffers
607  * @dd: the infinipath device
608  * @first: the first PIO buffer to cancel
609  * @cnt: the number of PIO buffers to cancel
610  *
611  * Cancel a range of PIO buffers, used when they might be armed, but
612  * not triggered.  Used at init to ensure buffer state, and also on
613  * user process close, in case the process died while writing to a
614  * PIO buffer.  Also used after errors.
615  */
616 void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
617                           unsigned cnt)
618 {
619         unsigned i, last = first + cnt;
620         u64 sendctrl, sendorig;
621
622         ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
623         sendorig = dd->ipath_sendctrl | INFINIPATH_S_DISARM;
624         for (i = first; i < last; i++) {
625                 sendctrl = sendorig |
626                         (i << INFINIPATH_S_DISARMPIOBUF_SHIFT);
627                 ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
628                                  sendctrl);
629         }
630
631         /*
632          * Write it again with current value, in case ipath_sendctrl changed
633          * while we were looping; no critical bits that would require
634          * locking.
635          *
636          * Write a 0, and then the original value, reading scratch in
637          * between.  This seems to avoid a chip timing race that causes
638          * pioavail updates to memory to stop.
639          */
640         ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
641                          0);
642         sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
643         ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
644                          dd->ipath_sendctrl);
645 }
646
647 /**
648  * ipath_wait_linkstate - wait for an IB link state change to occur
649  * @dd: the infinipath device
650  * @state: the state to wait for
651  * @msecs: the number of milliseconds to wait
652  *
653  * Wait up to msecs milliseconds for an IB link state change to occur.
654  * For now, take the easy polling route.  Currently used only by
655  * ipath_layer_set_linkstate.  Returns 0 if the state is reached,
656  * otherwise -ETIMEDOUT.  The state argument can have multiple state
657  * bits set, to allow for any of several transitions.
658  */
659 int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
660 {
661         dd->ipath_sma_state_wanted = state;
662         wait_event_interruptible_timeout(ipath_sma_state_wait,
663                                          (dd->ipath_flags & state),
664                                          msecs_to_jiffies(msecs));
665         dd->ipath_sma_state_wanted = 0;
666
667         if (!(dd->ipath_flags & state)) {
668                 u64 val;
669                 ipath_cdbg(SMA, "Didn't reach linkstate %s within %u ms\n",
670                            /* test INIT ahead of DOWN, both can be set */
671                            (state & IPATH_LINKINIT) ? "INIT" :
672                            ((state & IPATH_LINKDOWN) ? "DOWN" :
673                             ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
674                            msecs);
675                 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
676                 ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
677                            (unsigned long long) ipath_read_kreg64(
678                                    dd, dd->ipath_kregs->kr_ibcctrl),
679                            (unsigned long long) val,
680                            ipath_ibcstatus_str[val & 0xf]);
681         }
682         return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
683 }
684
685 void ipath_decode_err(char *buf, size_t blen, ipath_err_t err)
686 {
687         *buf = '\0';
688         if (err & INFINIPATH_E_RHDRLEN)
689                 strlcat(buf, "rhdrlen ", blen);
690         if (err & INFINIPATH_E_RBADTID)
691                 strlcat(buf, "rbadtid ", blen);
692         if (err & INFINIPATH_E_RBADVERSION)
693                 strlcat(buf, "rbadversion ", blen);
694         if (err & INFINIPATH_E_RHDR)
695                 strlcat(buf, "rhdr ", blen);
696         if (err & INFINIPATH_E_RLONGPKTLEN)
697                 strlcat(buf, "rlongpktlen ", blen);
698         if (err & INFINIPATH_E_RSHORTPKTLEN)
699                 strlcat(buf, "rshortpktlen ", blen);
700         if (err & INFINIPATH_E_RMAXPKTLEN)
701                 strlcat(buf, "rmaxpktlen ", blen);
702         if (err & INFINIPATH_E_RMINPKTLEN)
703                 strlcat(buf, "rminpktlen ", blen);
704         if (err & INFINIPATH_E_RFORMATERR)
705                 strlcat(buf, "rformaterr ", blen);
706         if (err & INFINIPATH_E_RUNSUPVL)
707                 strlcat(buf, "runsupvl ", blen);
708         if (err & INFINIPATH_E_RUNEXPCHAR)
709                 strlcat(buf, "runexpchar ", blen);
710         if (err & INFINIPATH_E_RIBFLOW)
711                 strlcat(buf, "ribflow ", blen);
712         if (err & INFINIPATH_E_REBP)
713                 strlcat(buf, "EBP ", blen);
714         if (err & INFINIPATH_E_SUNDERRUN)
715                 strlcat(buf, "sunderrun ", blen);
716         if (err & INFINIPATH_E_SPIOARMLAUNCH)
717                 strlcat(buf, "spioarmlaunch ", blen);
718         if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
719                 strlcat(buf, "sunexperrpktnum ", blen);
720         if (err & INFINIPATH_E_SDROPPEDDATAPKT)
721                 strlcat(buf, "sdroppeddatapkt ", blen);
722         if (err & INFINIPATH_E_SDROPPEDSMPPKT)
723                 strlcat(buf, "sdroppedsmppkt ", blen);
724         if (err & INFINIPATH_E_SMAXPKTLEN)
725                 strlcat(buf, "smaxpktlen ", blen);
726         if (err & INFINIPATH_E_SMINPKTLEN)
727                 strlcat(buf, "sminpktlen ", blen);
728         if (err & INFINIPATH_E_SUNSUPVL)
729                 strlcat(buf, "sunsupVL ", blen);
730         if (err & INFINIPATH_E_SPKTLEN)
731                 strlcat(buf, "spktlen ", blen);
732         if (err & INFINIPATH_E_INVALIDADDR)
733                 strlcat(buf, "invalidaddr ", blen);
734         if (err & INFINIPATH_E_RICRC)
735                 strlcat(buf, "CRC ", blen);
736         if (err & INFINIPATH_E_RVCRC)
737                 strlcat(buf, "VCRC ", blen);
738         if (err & INFINIPATH_E_RRCVEGRFULL)
739                 strlcat(buf, "rcvegrfull ", blen);
740         if (err & INFINIPATH_E_RRCVHDRFULL)
741                 strlcat(buf, "rcvhdrfull ", blen);
742         if (err & INFINIPATH_E_IBSTATUSCHANGED)
743                 strlcat(buf, "ibcstatuschg ", blen);
744         if (err & INFINIPATH_E_RIBLOSTLINK)
745                 strlcat(buf, "riblostlink ", blen);
746         if (err & INFINIPATH_E_HARDWARE)
747                 strlcat(buf, "hardware ", blen);
748         if (err & INFINIPATH_E_RESET)
749                 strlcat(buf, "reset ", blen);
750 }
751
752 /**
753  * get_rhf_errstring - decode RHF errors
754  * @err: the err number
755  * @msg: the output buffer
756  * @len: the length of the output buffer
757  *
758  * only used in one place now; may want more later
759  */
760 static void get_rhf_errstring(u32 err, char *msg, size_t len)
761 {
762         /* start with an empty string; each error found is appended below */
763         *msg = '\0';
764
765         if (err & INFINIPATH_RHF_H_ICRCERR)
766                 strlcat(msg, "icrcerr ", len);
767         if (err & INFINIPATH_RHF_H_VCRCERR)
768                 strlcat(msg, "vcrcerr ", len);
769         if (err & INFINIPATH_RHF_H_PARITYERR)
770                 strlcat(msg, "parityerr ", len);
771         if (err & INFINIPATH_RHF_H_LENERR)
772                 strlcat(msg, "lenerr ", len);
773         if (err & INFINIPATH_RHF_H_MTUERR)
774                 strlcat(msg, "mtuerr ", len);
775         if (err & INFINIPATH_RHF_H_IHDRERR)
776                 /* infinipath hdr checksum error */
777                 strlcat(msg, "ipathhdrerr ", len);
778         if (err & INFINIPATH_RHF_H_TIDERR)
779                 strlcat(msg, "tiderr ", len);
780         if (err & INFINIPATH_RHF_H_MKERR)
781                 /* bad port, offset, etc. */
782                 strlcat(msg, "invalid ipathhdr ", len);
783         if (err & INFINIPATH_RHF_H_IBERR)
784                 strlcat(msg, "iberr ", len);
785         if (err & INFINIPATH_RHF_L_SWA)
786                 strlcat(msg, "swA ", len);
787         if (err & INFINIPATH_RHF_L_SWB)
788                 strlcat(msg, "swB ", len);
789 }
790
791 /**
792  * ipath_get_egrbuf - get an eager buffer
793  * @dd: the infinipath device
794  * @bufnum: the eager buffer to get
795  * @err: unused
796  *
797  * must only be called if ipath_pd[port] is known to be allocated
798  */
799 static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum,
800                                      int err)
801 {
802         return dd->ipath_port0_skbs ?
803                 (void *)dd->ipath_port0_skbs[bufnum]->data : NULL;
804 }
805
806 /**
807  * ipath_alloc_skb - allocate an skb and buffer with possible constraints
808  * @dd: the infinipath device
809  * @gfp_mask: the sk_buff GFP mask
810  */
811 struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
812                                 gfp_t gfp_mask)
813 {
814         struct sk_buff *skb;
815         u32 len;
816
817         /*
818          * The only fully supported way to handle this is to allocate
819          * lots of extra space, align as needed, and then do skb_reserve().
820          * That wastes a lot of memory...  I'll have to hack this into
821          * infinipath_copy also.
822          */
823
824         /*
825          * We need 4 extra bytes for unaligned transfer copying
826          */
827         if (dd->ipath_flags & IPATH_4BYTE_TID) {
828                 /* we need a 4KB multiple alignment, and there is no way
829                  * to do it except to allocate extra and then skb_reserve
830                  * enough to bring it up to the right alignment.
831                  */
832                 len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1;
833         }
834         else
835                 len = dd->ipath_ibmaxlen + 4;
836         skb = __dev_alloc_skb(len, gfp_mask);
837         if (!skb) {
838                 ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
839                               len);
840                 goto bail;
841         }
842         if (dd->ipath_flags & IPATH_4BYTE_TID) {
843                 u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4);
844                 if (una)
845                         skb_reserve(skb, 4 + (1 << 11) - una);
846                 else
847                         skb_reserve(skb, 4);
848         } else
849                 skb_reserve(skb, 4);
850
851 bail:
852         return skb;
853 }
854
855 /**
856  * ipath_rcv_layer - receive a packet for the layered (ethernet) driver
857  * @dd: the infinipath device
858  * @etail: the sk_buff number
859  * @tlen: the total packet length
860  * @hdr: the ethernet header
861  *
862  * Separate routine for better overall optimization
863  */
864 static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail,
865                             u32 tlen, struct ether_header *hdr)
866 {
867         u32 elen;
868         u8 pad, *bthbytes;
869         struct sk_buff *skb, *nskb;
870
871         if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) {
872                 /*
873                  * Allocate a new sk_buff to replace the one we give
874                  * to the network stack.
875                  */
876                 nskb = ipath_alloc_skb(dd, GFP_ATOMIC);
877                 if (!nskb) {
878                         /* count OK packets that we drop */
879                         ipath_stats.sps_krdrops++;
880                         return;
881                 }
882
883                 bthbytes = (u8 *) hdr->bth;
884                 pad = (bthbytes[1] >> 4) & 3;
885                 /* +CRC32 */
886                 elen = tlen - (sizeof(*hdr) + pad + sizeof(u32));
887
888                 skb = dd->ipath_port0_skbs[etail];
889                 dd->ipath_port0_skbs[etail] = nskb;
890                 skb_put(skb, elen);
891
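                /* point this eager buffer's TID entry at the replacement skb's data */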
892                 dd->ipath_f_put_tid(dd, etail + (u64 __iomem *)
893                                     ((char __iomem *) dd->ipath_kregbase
894                                      + dd->ipath_rcvegrbase), 0,
895                                     virt_to_phys(nskb->data));
896
897                 __ipath_layer_rcv(dd, hdr, skb);
898
899                 /* another ether packet received */
900                 ipath_stats.sps_ether_rpkts++;
901         }
902         else if (hdr->sub_opcode == OPCODE_LID_ARP)
903                 __ipath_layer_rcv_lid(dd, hdr);
904 }
905
906 /*
907  * ipath_kreceive - receive a packet
908  * @dd: the infinipath device
909  *
910  * called from interrupt handler for errors or receive interrupt
911  */
912 void ipath_kreceive(struct ipath_devdata *dd)
913 {
914         u64 *rc;
915         void *ebuf;
916         const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
917         const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
918         u32 etail = -1, l, hdrqtail;
919         struct ips_message_header *hdr;
920         u32 eflags, i, etype, tlen, pkttot = 0;
921         static u64 totcalls;    /* stats, may eventually remove */
922         char emsg[128];
923
924         if (!dd->ipath_hdrqtailptr) {
925                 ipath_dev_err(dd,
926                               "hdrqtailptr not set, can't do receives\n");
927                 goto bail;
928         }
929
930         /* There is already a thread processing this queue. */
931         if (test_and_set_bit(0, &dd->ipath_rcv_pending))
932                 goto bail;
933
934         if (dd->ipath_port0head ==
935             (u32)le64_to_cpu(*dd->ipath_hdrqtailptr))
936                 goto done;
937
938 gotmore:
939         /*
940          * Read the tail pointer only once at the start.  In a flood
941          * situation this helps performance slightly.  If more packets
942          * arrive while we are processing, we'll come back here and
943          * handle them as well.
943          */
944         hdrqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr);
945
946         for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) {
947                 u32 qp;
948                 u8 *bthbytes;
949
950                 rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2));
951                 hdr = (struct ips_message_header *)&rc[1];
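                /*
                 * The ips_get_*() helpers below decode the receive header
                 * flags from the start of the entry; the message header
                 * itself begins at rc[1].
                 */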
952                 /*
953                  * could make a network order version of IPATH_KD_QP, and
954                  * do the obvious shift before masking to speed this up.
955                  */
956                 qp = ntohl(hdr->bth[1]) & 0xffffff;
957                 bthbytes = (u8 *) hdr->bth;
958
959                 eflags = ips_get_hdr_err_flags((__le32 *) rc);
960                 etype = ips_get_rcv_type((__le32 *) rc);
961                 /* total length */
962                 tlen = ips_get_length_in_bytes((__le32 *) rc);
963                 ebuf = NULL;
964                 if (etype != RCVHQ_RCV_TYPE_EXPECTED) {
965                         /*
966                          * it turns out that the chip uses an eager buffer
967                          * for all non-expected packets, whether it "needs"
968                          * one or not.  So always get the index, but don't
969                          * set ebuf (so we try to copy data) unless the
970                          * length requires it.
971                          */
972                         etail = ips_get_index((__le32 *) rc);
973                         if (tlen > sizeof(*hdr) ||
974                             etype == RCVHQ_RCV_TYPE_NON_KD)
975                                 ebuf = ipath_get_egrbuf(dd, etail, 0);
976                 }
977
978                 /*
979                  * both tiderr and ipathhdrerr are set for all plain IB
980                  * packets; only ipathhdrerr should be set.
981                  */
982
983                 if (etype != RCVHQ_RCV_TYPE_NON_KD && etype !=
984                     RCVHQ_RCV_TYPE_ERROR && ips_get_ipath_ver(
985                             hdr->iph.ver_port_tid_offset) !=
986                     IPS_PROTO_VERSION) {
987                         ipath_cdbg(PKT, "Bad InfiniPath protocol version "
988                                    "%x\n", etype);
989                 }
990
991                 if (eflags & ~(INFINIPATH_RHF_H_TIDERR |
992                                INFINIPATH_RHF_H_IHDRERR)) {
993                         get_rhf_errstring(eflags, emsg, sizeof emsg);
994                         ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
995                                    "tlen=%x opcode=%x egridx=%x: %s\n",
996                                    eflags, l, etype, tlen, bthbytes[0],
997                                    ips_get_index((__le32 *) rc), emsg);
998                 } else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
999                                 int ret = __ipath_verbs_rcv(dd, rc + 1,
1000                                                             ebuf, tlen);
1001                                 if (ret == -ENODEV)
1002                                         ipath_cdbg(VERBOSE,
1003                                                    "received IB packet, "
1004                                                    "not SMA (QP=%x)\n", qp);
1005                 } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
1006                         if (qp == IPATH_KD_QP &&
1007                             bthbytes[0] == ipath_layer_rcv_opcode &&
1008                             ebuf)
1009                                 ipath_rcv_layer(dd, etail, tlen,
1010                                                 (struct ether_header *)hdr);
1011                         else
1012                                 ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
1013                                            "qp=%x), len %x; ignored\n",
1014                                            etype, bthbytes[0], qp, tlen);
1015                 }
1016                 else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
1017                         ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
1018                                   be32_to_cpu(hdr->bth[0]) & 0xff);
1019                 else if (eflags & (INFINIPATH_RHF_H_TIDERR |
1020                                    INFINIPATH_RHF_H_IHDRERR)) {
1021                         /*
1022                          * This is a type 3 packet, only the LRH is in the
1023                          * rcvhdrq, the rest of the header is in the eager
1024                          * buffer.
1025                          */
1026                         u8 opcode;
1027                         if (ebuf) {
1028                                 bthbytes = (u8 *) ebuf;
1029                                 opcode = *bthbytes;
1030                         }
1031                         else
1032                                 opcode = 0;
1033                         get_rhf_errstring(eflags, emsg, sizeof emsg);
1034                         ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, "
1035                                   "len %x\n", eflags, emsg, opcode, etail,
1036                                   tlen);
1037                 } else {
1038                         /*
1039                          * error packet, type of error unknown.
1040                          * Probably type 3, but we don't know, so don't
1041                          * even try to print the opcode, etc.
1042                          */
1043                         ipath_dbg("Error Pkt, but no eflags! egrbuf %x, "
1044                                   "len %x\nhdrq@%lx;hdrq+%x rhf: %llx; "
1045                                   "hdr %llx %llx %llx %llx %llx\n",
1046                                   etail, tlen, (unsigned long) rc, l,
1047                                   (unsigned long long) rc[0],
1048                                   (unsigned long long) rc[1],
1049                                   (unsigned long long) rc[2],
1050                                   (unsigned long long) rc[3],
1051                                   (unsigned long long) rc[4],
1052                                   (unsigned long long) rc[5]);
1053                 }
1054                 l += rsize;
1055                 if (l >= maxcnt)
1056                         l = 0;
1057                 /*
1058                  * update for each packet, to help prevent overflows if we
1059                  * have lots of packets.
1060                  */
1061                 (void)ipath_write_ureg(dd, ur_rcvhdrhead,
1062                                        dd->ipath_rhdrhead_intr_off | l, 0);
1063                 if (etype != RCVHQ_RCV_TYPE_EXPECTED)
1064                         (void)ipath_write_ureg(dd, ur_rcvegrindexhead,
1065                                                etail, 0);
1066         }
1067
1068         pkttot += i;
1069
1070         dd->ipath_port0head = l;
1071
1072         if (hdrqtail != (u32)le64_to_cpu(*dd->ipath_hdrqtailptr))
1073                 /* more arrived while we handled first batch */
1074                 goto gotmore;
1075
1076         if (pkttot > ipath_stats.sps_maxpkts_call)
1077                 ipath_stats.sps_maxpkts_call = pkttot;
1078         ipath_stats.sps_port0pkts += pkttot;
1079         ipath_stats.sps_avgpkts_call =
1080                 ipath_stats.sps_port0pkts / ++totcalls;
1081
1082 done:
1083         clear_bit(0, &dd->ipath_rcv_pending);
1084         smp_mb__after_clear_bit();
1085
1086 bail:;
1087 }
1088
1089 /**
1090  * ipath_update_pio_bufs - update shadow copy of the PIO availability map
1091  * @dd: the infinipath device
1092  *
1093  * Called whenever our local copy indicates we have run out of send buffers.
1094  * NOTE: This can be called from interrupt context by some code
1095  * and from non-interrupt context by ipath_getpiobuf().
1096  */
1097
1098 static void ipath_update_pio_bufs(struct ipath_devdata *dd)
1099 {
1100         unsigned long flags;
1101         int i;
1102         const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
1103
1104         /* If the generation (check) bits have changed, then we update the
1105          * busy bit for the corresponding PIO buffer.  This algorithm will
1106          * modify positions to the value they already have in some cases
1107          * (i.e., no change), but it's faster than changing only the bits
1108          * that have changed.
1109          *
1110          * We would like to do this atomically, to avoid spinlocks in the
1111          * critical send path, but that's not really possible, given the
1112          * type of changes, and that this routine could be called on
1113          * multiple cpu's simultaneously, so we lock in this routine only,
1114          * to avoid conflicting updates; all we change is the shadow, and
1115          * it's a single 64 bit memory location, so by definition the update
1116          * is atomic in terms of what other cpu's can see in testing the
1117          * bits.  The spin_lock overhead isn't too bad, since it only
1118          * happens when all buffers are in use, so only cpu overhead, not
1119          * latency or bandwidth is affected.
1120          */
1121 #define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL
1122         if (!dd->ipath_pioavailregs_dma) {
1123                 ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
1124                 return;
1125         }
1126         if (ipath_debug & __IPATH_VERBDBG) {
1127                 /* only if packet debug and verbose */
1128                 volatile __le64 *dma = dd->ipath_pioavailregs_dma;
1129                 unsigned long *shadow = dd->ipath_pioavailshadow;
1130
1131                 ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
1132                            "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
1133                            "s3=%lx\n",
1134                            (unsigned long long) le64_to_cpu(dma[0]),
1135                            shadow[0],
1136                            (unsigned long long) le64_to_cpu(dma[1]),
1137                            shadow[1],
1138                            (unsigned long long) le64_to_cpu(dma[2]),
1139                            shadow[2],
1140                            (unsigned long long) le64_to_cpu(dma[3]),
1141                            shadow[3]);
1142                 if (piobregs > 4)
1143                         ipath_cdbg(
1144                                 PKT, "2nd group, dma4=%llx shad4=%lx, "
1145                                 "d5=%llx s5=%lx, d6=%llx s6=%lx, "
1146                                 "d7=%llx s7=%lx\n",
1147                                 (unsigned long long) le64_to_cpu(dma[4]),
1148                                 shadow[4],
1149                                 (unsigned long long) le64_to_cpu(dma[5]),
1150                                 shadow[5],
1151                                 (unsigned long long) le64_to_cpu(dma[6]),
1152                                 shadow[6],
1153                                 (unsigned long long) le64_to_cpu(dma[7]),
1154                                 shadow[7]);
1155         }
1156         spin_lock_irqsave(&ipath_pioavail_lock, flags);
1157         for (i = 0; i < piobregs; i++) {
1158                 u64 pchbusy, pchg, piov, pnew;
1159                 /*
1160                  * Chip Errata: bug 6641; even and odd qwords>3 are swapped
1161                  */
1162                 if (i > 3) {
1163                         if (i & 1)
1164                                 piov = le64_to_cpu(
1165                                         dd->ipath_pioavailregs_dma[i - 1]);
1166                         else
1167                                 piov = le64_to_cpu(
1168                                         dd->ipath_pioavailregs_dma[i + 1]);
1169                 } else
1170                         piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
1171                 pchg = _IPATH_ALL_CHECKBITS &
1172                         ~(dd->ipath_pioavailshadow[i] ^ piov);
1173                 pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
1174                 if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
1175                         pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
1176                         pnew |= piov & pchbusy;
1177                         dd->ipath_pioavailshadow[i] = pnew;
1178                 }
1179         }
1180         spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1181 }
1182
1183 /**
1184  * ipath_setrcvhdrsize - set the receive header size
1185  * @dd: the infinipath device
1186  * @rhdrsize: the receive header size
1187  *
1188  * called from user init code, and also layered driver init
1189  */
1190 int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
1191 {
1192         int ret = 0;
1193
1194         if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
1195                 if (dd->ipath_rcvhdrsize != rhdrsize) {
1196                         dev_info(&dd->pcidev->dev,
1197                                  "Error: can't set protocol header "
1198                                  "size %u, already %u\n",
1199                                  rhdrsize, dd->ipath_rcvhdrsize);
1200                         ret = -EAGAIN;
1201                 } else
1202                         ipath_cdbg(VERBOSE, "Reuse same protocol header "
1203                                    "size %u\n", dd->ipath_rcvhdrsize);
1204         } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
1205                                (sizeof(u64) / sizeof(u32)))) {
1206                 ipath_dbg("Error: can't set protocol header size %u "
1207                           "(> max %u)\n", rhdrsize,
1208                           dd->ipath_rcvhdrentsize -
1209                           (u32) (sizeof(u64) / sizeof(u32)));
1210                 ret = -EOVERFLOW;
1211         } else {
1212                 dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
1213                 dd->ipath_rcvhdrsize = rhdrsize;
1214                 ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
1215                                  dd->ipath_rcvhdrsize);
1216                 ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
1217                            dd->ipath_rcvhdrsize);
1218         }
1219         return ret;
1220 }
1221
1222 /**
1223  * ipath_getpiobuf - find an available pio buffer
1224  * @dd: the infinipath device
1225  * @pbufnum: the buffer number is placed here
1226  *
1227  * do appropriate marking as busy, etc.
1228  * returns buffer number if one found (>=0), negative number is error.
1229  * Returns a pointer to the buffer (NULL if none available); *pbufnum is set to its number.
1230  */
1231 u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum)
1232 {
1233         int i, j, starti, updated = 0;
1234         unsigned piobcnt, iter;
1235         unsigned long flags;
1236         unsigned long *shadow = dd->ipath_pioavailshadow;
1237         u32 __iomem *buf;
1238
1239         piobcnt = (unsigned)(dd->ipath_piobcnt2k
1240                              + dd->ipath_piobcnt4k);
1241         starti = dd->ipath_lastport_piobuf;
1242         iter = piobcnt - starti;
1243         if (dd->ipath_upd_pio_shadow) {
1244                 /*
1245                  * Minor optimization.  If we had no buffers on last call,
1246                  * start out by doing the update; continue and do scan even
1247                  * if no buffers were updated, to be paranoid
1248                  */
1249                 ipath_update_pio_bufs(dd);
1250                 /* we scanned here, don't do it at end of scan */
1251                 updated = 1;
1252                 i = starti;
1253         } else
1254                 i = dd->ipath_lastpioindex;
1255
1256 rescan:
1257         /*
1258          * while test_and_set_bit() is atomic, we do that and then the
1259          * change_bit(), and the pair is not.  See if this is the cause
1260          * of the remaining armlaunch errors.
1261          */
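        /*
         * Each buffer uses two bits in the shadow: bit 2*i is the
         * generation ("check") bit and bit 2*i+1 is the busy bit; a
         * buffer is claimed by setting its busy bit and flipping its
         * generation bit.
         */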
1262         spin_lock_irqsave(&ipath_pioavail_lock, flags);
1263         for (j = 0; j < iter; j++, i++) {
1264                 if (i >= piobcnt)
1265                         i = starti;
1266                 /*
1267                  * To avoid bus lock overhead, we first find a candidate
1268                  * buffer, then do the test and set, and continue if that
1269                  * fails.
1270                  */
1271                 if (test_bit((2 * i) + 1, shadow) ||
1272                     test_and_set_bit((2 * i) + 1, shadow))
1273                         continue;
1274                 /* flip generation bit */
1275                 change_bit(2 * i, shadow);
1276                 break;
1277         }
1278         spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1279
1280         if (j == iter) {
1281                 volatile __le64 *dma = dd->ipath_pioavailregs_dma;
1282
1283                 /*
1284                  * first time through; the shadow is exhausted, but there
1285                  * may be real buffers available, so go see; if any were
1286                  * updated, rescan (once)
1287                  */
1288                 if (!updated) {
1289                         ipath_update_pio_bufs(dd);
1290                         updated = 1;
1291                         i = starti;
1292                         goto rescan;
1293                 }
1294                 dd->ipath_upd_pio_shadow = 1;
1295                 /*
1296                  * not atomic, but if we lose one once in a while, that's OK
1297                  */
1298                 ipath_stats.sps_nopiobufs++;
1299                 if (!(++dd->ipath_consec_nopiobuf % 100000)) {
1300                         ipath_dbg(
1301                                 "%u pio sends with no bufavail; dmacopy: "
1302                                 "%llx %llx %llx %llx; shadow:  "
1303                                 "%lx %lx %lx %lx\n",
1304                                 dd->ipath_consec_nopiobuf,
1305                                 (unsigned long long) le64_to_cpu(dma[0]),
1306                                 (unsigned long long) le64_to_cpu(dma[1]),
1307                                 (unsigned long long) le64_to_cpu(dma[2]),
1308                                 (unsigned long long) le64_to_cpu(dma[3]),
1309                                 shadow[0], shadow[1], shadow[2],
1310                                 shadow[3]);
1311                         /*
1312                          * 4 buffers per byte, 4 registers above, cover rest
1313                          * below
1314                          */
1315                         if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
1316                             (sizeof(shadow[0]) * 4 * 4))
1317                                 ipath_dbg("2nd group: dmacopy: %llx %llx "
1318                                           "%llx %llx; shadow: %lx %lx "
1319                                           "%lx %lx\n",
1320                                           (unsigned long long)
1321                                           le64_to_cpu(dma[4]),
1322                                           (unsigned long long)
1323                                           le64_to_cpu(dma[5]),
1324                                           (unsigned long long)
1325                                           le64_to_cpu(dma[6]),
1326                                           (unsigned long long)
1327                                           le64_to_cpu(dma[7]),
1328                                           shadow[4], shadow[5],
1329                                           shadow[6], shadow[7]);
1330                 }
1331                 buf = NULL;
1332                 goto bail;
1333         }
1334
1335         if (updated)
1336                 /*
1337                  * ran out of bufs, now some (at least this one we just
1338                  * got) are now available, so tell the layered driver.
1339                  */
1340                 __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE);
1341
1342         /*
1343          * set next starting place.  Since it's just an optimization,
1344          * it doesn't matter who wins on this, so no locking
1345          */
1346         dd->ipath_lastpioindex = i + 1;
1347         if (dd->ipath_upd_pio_shadow)
1348                 dd->ipath_upd_pio_shadow = 0;
1349         if (dd->ipath_consec_nopiobuf)
1350                 dd->ipath_consec_nopiobuf = 0;
1351         if (i < dd->ipath_piobcnt2k)
1352                 buf = (u32 __iomem *) (dd->ipath_pio2kbase +
1353                                        i * dd->ipath_palign);
1354         else
1355                 buf = (u32 __iomem *)
1356                         (dd->ipath_pio4kbase +
1357                          (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
1358         ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
1359                    i, (i < dd->ipath_piobcnt2k) ? 2 : 4, buf);
1360         if (pbufnum)
1361                 *pbufnum = i;
1362
1363 bail:
1364         return buf;
1365 }
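
/*
 * Illustrative sketch (not driver code): a send path would use the PIO
 * buffer allocator above roughly as follows.  The allocator's name
 * (ipath_getpiobuf) and the packet ("hdr", "nwords") are assumptions made
 * for this sketch only:
 *
 *	u32 pbufnum;
 *	u32 __iomem *piobuf = ipath_getpiobuf(dd, &pbufnum);
 *
 *	if (piobuf)
 *		copy "nwords" 32-bit words from "hdr" into the
 *		write-combining PIO buffer, e.g. with writel();
 *
 * When NULL is returned, a layered caller can wait for the
 * IPATH_LAYER_INT_SEND_CONTINUE notification (sent above once buffers
 * become available again) before retrying.
 */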
1366
1367 /**
1368  * ipath_create_rcvhdrq - create a receive header queue
1369  * @dd: the infinipath device
1370  * @pd: the port data
1371  *
1372  * This *must* be physically contiguous memory; for now, that limits
1373  * it to what a single dma_alloc_coherent() allocation can provide.
1374  */
1375 int ipath_create_rcvhdrq(struct ipath_devdata *dd,
1376                          struct ipath_portdata *pd)
1377 {
1378         int ret = 0, amt;
1379
1380         amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
1381                     sizeof(u32), PAGE_SIZE);
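        /*
         * Note: amt is the header queue size in bytes, rounded up to a
         * whole page.  With hypothetical values of 512 entries
         * (rcvhdrcnt) of 16 32-bit words each (rcvhdrentsize), that is
         * 512 * 16 * 4 = 32768 bytes, which ALIGN() leaves unchanged on
         * a 4KB-page system.
         */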
1382         if (!pd->port_rcvhdrq) {
1383                 /*
1384                  * not using REPEAT isn't viable; at 128KB, we can easily
1385                  * fail this.  The problem with REPEAT is we can block here
1386                  * "forever".  There isn't an in-between, unfortunately.  We
1387                  * could reduce the risk by never freeing the rcvhdrq except
1388                  * at unload, but even then, the first time a port is used,
1389                  * we could delay for some time...
1390                  */
1391                 gfp_t gfp_flags = GFP_USER | __GFP_COMP;
1392
1393                 pd->port_rcvhdrq = dma_alloc_coherent(
1394                         &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
1395                         gfp_flags);
1396
1397                 if (!pd->port_rcvhdrq) {
1398                         ipath_dev_err(dd, "attempt to allocate %d bytes "
1399                                       "for port %u rcvhdrq failed\n",
1400                                       amt, pd->port_port);
1401                         ret = -ENOMEM;
1402                         goto bail;
1403                 }
1404
1405                 pd->port_rcvhdrq_size = amt;
1406
1407                 ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
1408                            "for port %u rcvhdr Q\n",
1409                            amt >> PAGE_SHIFT, pd->port_rcvhdrq,
1410                            (unsigned long) pd->port_rcvhdrq_phys,
1411                            (unsigned long) pd->port_rcvhdrq_size,
1412                            pd->port_port);
1413         } else {
1414                 /*
1415                  * clear for security, sanity, and/or debugging, each
1416                  * time we reuse
1417                  */
1418                 memset(pd->port_rcvhdrq, 0, amt);
1419         }
1420
1421         /*
1422          * tell chip each time we init it, even if we are re-using previous
1423          * memory (we zero it at process close)
1424          */
1425         ipath_cdbg(VERBOSE, "writing port %d rcvhdraddr as %lx\n",
1426                    pd->port_port, (unsigned long) pd->port_rcvhdrq_phys);
1427         ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
1428                               pd->port_port, pd->port_rcvhdrq_phys);
1429
1430         ret = 0;
1431 bail:
1432         return ret;
1433 }
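
/*
 * Illustrative sketch of a typical call, hypothetically from a port
 * initialization path, which is where the allocate-or-reuse behaviour
 * above matters:
 *
 *	ret = ipath_create_rcvhdrq(dd, pd);
 *	if (ret)
 *		fail the port init with ret (-ENOMEM on allocation failure);
 *
 * On reuse the queue is only zeroed, but the chip is told its physical
 * address again either way.
 */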
1434
1435 int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id,
1436                            u64 bits_to_wait_for, u64 *valp)
1437 {
1438         unsigned long timeout;
1439         u64 lastval, val;
1440         int ret;
1441
1442         lastval = ipath_read_kreg64(dd, reg_id);
1443         /* wait a ridiculously long time */
1444         timeout = jiffies + msecs_to_jiffies(5);
1445         do {
1446                 val = ipath_read_kreg64(dd, reg_id);
1447                 /* set so they have something, even on failures. */
1448                 *valp = val;
1449                 if ((val & bits_to_wait_for) == bits_to_wait_for) {
1450                         ret = 0;
1451                         break;
1452                 }
1453                 if (val != lastval)
1454                         ipath_cdbg(VERBOSE, "Changed from %llx to %llx, "
1455                                    "waiting for %llx bits\n",
1456                                    (unsigned long long) lastval,
1457                                    (unsigned long long) val,
1458                                    (unsigned long long) bits_to_wait_for);
1459                 cond_resched();
1460                 if (time_after(jiffies, timeout)) {
1461                         ipath_dbg("Didn't get bits %llx in register 0x%x, "
1462                                   "got %llx\n",
1463                                   (unsigned long long) bits_to_wait_for,
1464                                   reg_id, (unsigned long long) *valp);
1465                         ret = -ENODEV;
1466                         break;
1467                 }
1468         } while (1);
1469
1470         return ret;
1471 }
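
/*
 * Illustrative sketch (the register and bit mask below are placeholders,
 * not real driver symbols): polling a status register with the helper
 * above looks like
 *
 *	u64 val;
 *
 *	if (ipath_waitfor_complete(dd, some_kreg, SOME_READY_BITS, &val))
 *		handle the roughly 5ms timeout; val still holds the last
 *		value read, since *valp is updated on every poll iteration.
 */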
1472
1473 /**
1474  * ipath_waitfor_mdio_cmdready - wait for last command to complete
1475  * @dd: the infinipath device
1476  *
1477  * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go
1478  * away, indicating the last command has completed.  It doesn't return data.
1479  */
1480 int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd)
1481 {
1482         unsigned long timeout;
1483         u64 val;
1484         int ret;
1485
1486         /* wait a ridiculously long time */
1487         timeout = jiffies + msecs_to_jiffies(5);
1488         do {
1489                 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio);
1490                 if (!(val & IPATH_MDIO_CMDVALID)) {
1491                         ret = 0;
1492                         break;
1493                 }
1494                 cond_resched();
1495                 if (time_after(jiffies, timeout)) {
1496                         ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n",
1497                                   (unsigned long long) val);
1498                         ret = -ENODEV;
1499                         break;
1500                 }
1501         } while (1);
1502
1503         return ret;
1504 }
1505
1506 void ipath_set_ib_lstate(struct ipath_devdata *dd, int which)
1507 {
1508         static const char *what[4] = {
1509                 [0] = "DOWN",
1510                 [INFINIPATH_IBCC_LINKCMD_INIT] = "INIT",
1511                 [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
1512                 [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
1513         };
1514         ipath_cdbg(SMA, "Trying to move unit %u to %s, current ltstate "
1515                    "is %s\n", dd->ipath_unit,
1516                    what[(which >> INFINIPATH_IBCC_LINKCMD_SHIFT) &
1517                         INFINIPATH_IBCC_LINKCMD_MASK],
1518                    ipath_ibcstatus_str[
1519                            (ipath_read_kreg64
1520                             (dd, dd->ipath_kregs->kr_ibcstatus) >>
1521                             INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
1522                            INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]);
1523
1524         ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
1525                          dd->ipath_ibcctrl | which);
1526 }
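
/*
 * Note: callers pass an already-shifted command, e.g. the shutdown path
 * below uses
 *
 *	INFINIPATH_IBCC_LINKINITCMD_DISABLE << INFINIPATH_IBCC_LINKINITCMD_SHIFT
 *
 * and the value is simply OR'd with the cached ipath_ibcctrl before being
 * written back to the chip.
 */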
1527
1528 /**
1529  * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register
1530  * @dd: the infinipath device
1531  * @regno: the register number to read
1532  * @port: the port containing the register
1533  *
1534  * Registers that vary with a chip implementation constant (here, the
1535  * port) use this routine.
1536  */
1537 u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno,
1538                            unsigned port)
1539 {
1540         u16 where;
1541
1542         if (port < dd->ipath_portcnt &&
1543             (regno == dd->ipath_kregs->kr_rcvhdraddr ||
1544              regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
1545                 where = regno + port;
1546         else
1547                 where = -1;
1548
1549         return ipath_read_kreg64(dd, where);
1550 }
1551
1552 /**
1553  * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
1554  * @dd: the infinipath device
1555  * @regno: the register number to write
1556  * @port: the port containing the register
1557  * @value: the value to write
1558  *
1559  * Registers that vary with a chip implementation constant (here, the
1560  * port) use this routine.
1561  */
1562 void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
1563                           unsigned port, u64 value)
1564 {
1565         u16 where;
1566
1567         if (port < dd->ipath_portcnt &&
1568             (regno == dd->ipath_kregs->kr_rcvhdraddr ||
1569              regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
1570                 where = regno + port;
1571         else
1572                 where = -1;
1573
1574         ipath_write_kreg(dd, where, value);
1575 }
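
/*
 * Note on the two helpers above: the per-port rcvhdr address registers are
 * laid out as consecutive 64-bit kregs, so the index for port N is just
 * the base register number plus N.  Any other register, or an out-of-range
 * port, falls through to an index of -1 (0xffff as a u16), presumably so
 * the access is treated as invalid rather than silently hitting the wrong
 * register.
 */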
1576
1577 /**
1578  * ipath_shutdown_device - shut down a device
1579  * @dd: the infinipath device
1580  *
1581  * This is called to make the device quiet when we are about to
1582  * unload the driver, and also when the device is administratively
1583  * disabled.   It does not free any data structures.
1584  * Everything it does has to be set up again by ipath_init_chip(dd, 1).
1585  */
1586 void ipath_shutdown_device(struct ipath_devdata *dd)
1587 {
1588         u64 val;
1589
1590         ipath_dbg("Shutting down the device\n");
1591
1592         dd->ipath_flags |= IPATH_LINKUNK;
1593         dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
1594                              IPATH_LINKINIT | IPATH_LINKARMED |
1595                              IPATH_LINKACTIVE);
1596         *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
1597                                 IPATH_STATUS_IB_READY);
1598
1599         /* mask interrupts, but not errors */
1600         ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
1601
1602         dd->ipath_rcvctrl = 0;
1603         ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
1604                          dd->ipath_rcvctrl);
1605
1606         /*
1607          * gracefully stop all sends, allowing any in progress to trickle out
1608          * first.
1609          */
1610         ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL);
1611         /* flush it */
1612         val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1613         /*
1614          * delay long enough for anything that's going to trickle out to have
1615          * actually done so.
1616          */
1617         udelay(5);
1618
1619         /*
1620          * abort any armed or launched PIO buffers that didn't go. (self
1621          * clearing).  Will cause any packet currently being transmitted to
1622          * go out with an EBP, and may also cause a short packet error on
1623          * the receiver.
1624          */
1625         ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1626                          INFINIPATH_S_ABORT);
1627
1628         ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
1629                             INFINIPATH_IBCC_LINKINITCMD_SHIFT);
1630
1631         /*
1632          * We are shutting down, so tell the layered driver.  We don't do
1633          * this on just a link state change; much as with ethernet, a cable
1634          * unplug, etc. doesn't change driver state.
1635          */
1636         ipath_layer_intr(dd, IPATH_LAYER_INT_IF_DOWN);
1637
1638         /* disable IBC */
1639         dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
1640         ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
1641                          dd->ipath_control);
1642
1643         /*
1644          * clear SerdesEnable and turn the LEDs off; do this here because
1645          * we are unloading, so we can't count on interrupts to move things
1646          * along.  Turn the LEDs off explicitly for the same reason.
1647          */
1648         dd->ipath_f_quiet_serdes(dd);
1649         dd->ipath_f_setextled(dd, 0, 0);
1650
1651         if (dd->ipath_stats_timer_active) {
1652                 del_timer_sync(&dd->ipath_stats_timer);
1653                 dd->ipath_stats_timer_active = 0;
1654         }
1655
1656         /*
1657          * clear all interrupts and errors, so that the next time the driver
1658          * is loaded or device is enabled, we know that whatever is set
1659          * happened while we were unloaded
1660          */
1661         ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
1662                          ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
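        /*
         * Note: the hwerrclear write just above clears every hardware
         * error bit except MEMBISTFAILED, presumably so the memory-BIST
         * result is still visible after an unload/reload.
         */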
1663         ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
1664         ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
1665 }
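
/*
 * Summary of the quiesce order above: mask interrupts, stop the receive
 * side, let in-flight sends drain and then abort the rest, disable the IB
 * link, notify the layered driver, disable the IBC, quiesce the serdes and
 * LEDs, stop the stats timer, and finally clear interrupt and error state
 * so that anything seen on the next load happened while we were unloaded.
 */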
1666
1667 /**
1668  * ipath_free_pddata - free a port's allocated data
1669  * @dd: the infinipath device
1670  * @port: the port
1671  * @freehdrq: free the port data structure if true
1672  *
1673  * When closing, free up any allocated data for a port if the
1674  * reference count goes to zero.
1675  * Note: this also optionally frees the portdata itself!
1676  * Any changes here have to be matched up with the reinit case
1677  * of ipath_init_chip(), which calls this routine on reinit after reset.
1678  */
1679 void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq)
1680 {
1681         struct ipath_portdata *pd = dd->ipath_pd[port];
1682
1683         if (!pd)
1684                 return;
1685         if (freehdrq)
1686                 /*
1687                  * only clear and free portdata if we are going to also
1688                  * release the hdrq, otherwise we leak the hdrq on each
1689                  * open/close cycle
1690                  */
1691                 dd->ipath_pd[port] = NULL;
1692         if (freehdrq && pd->port_rcvhdrq) {
1693                 ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
1694                            "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
1695                            (unsigned long) pd->port_rcvhdrq_size);
1696                 dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
1697                                   pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
1698                 pd->port_rcvhdrq = NULL;
1699         }
1700         if (port && pd->port_rcvegrbuf) {
1701                 /* always free this */
1702                 if (pd->port_rcvegrbuf) {
1703                         unsigned e;
1704
1705                         for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
1706                                 void *base = pd->port_rcvegrbuf[e];
1707                                 size_t size = pd->port_rcvegrbuf_size;
1708
1709                                 ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
1710                                            "chunk %u/%u\n", base,
1711                                            (unsigned long) size,
1712                                            e, pd->port_rcvegrbuf_chunks);
1713                                 dma_free_coherent(
1714                                         &dd->pcidev->dev, size, base,
1715                                         pd->port_rcvegrbuf_phys[e]);
1716                         }
1717                         vfree(pd->port_rcvegrbuf);
1718                         pd->port_rcvegrbuf = NULL;
1719                         vfree(pd->port_rcvegrbuf_phys);
1720                         pd->port_rcvegrbuf_phys = NULL;
1721                 }
1722                 pd->port_rcvegrbuf_chunks = 0;
1723         } else if (port == 0 && dd->ipath_port0_skbs) {
1724                 unsigned e;
1725                 struct sk_buff **skbs = dd->ipath_port0_skbs;
1726
1727                 dd->ipath_port0_skbs = NULL;
1728                 ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs "
1729                            "@ %p\n", pd->port_port, skbs);
1730                 for (e = 0; e < dd->ipath_rcvegrcnt; e++)
1731                         if (skbs[e])
1732                                 dev_kfree_skb(skbs[e]);
1733                 vfree(skbs);
1734         }
1735         if (freehdrq) {
1736                 kfree(pd->port_tid_pg_list);
1737                 kfree(pd);
1738         }
1739 }
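
/*
 * Note: the two branches above reflect the two receive-buffer schemes
 * visible here: user ports (port != 0) keep DMA-coherent eager buffer
 * chunks in port_rcvegrbuf[], freed with dma_free_coherent(), while
 * kernel port 0 keeps a vmalloc'd array of sk_buffs (ipath_port0_skbs),
 * freed with dev_kfree_skb() and vfree().
 */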
1740
1741 static int __init infinipath_init(void)
1742 {
1743         int ret;
1744
1745         ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version);
1746
1747         /*
1748          * These must be called before the driver is registered with
1749          * the PCI subsystem.
1750          */
1751         idr_init(&unit_table);
1752         if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
1753                 ret = -ENOMEM;
1754                 goto bail;
1755         }
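        /*
         * Note: with the IDR API used here, idr_pre_get() preallocates
         * the memory that a later idr_get_new() (presumably called when a
         * device is probed and assigned a unit number) will consume, so
         * that the later call does not itself have to allocate.
         */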
1756
1757         ret = pci_register_driver(&ipath_driver);
1758         if (ret < 0) {
1759                 printk(KERN_ERR IPATH_DRV_NAME
1760                        ": Unable to register driver: error %d\n", -ret);
1761                 goto bail_unit;
1762         }
1763
1764         ret = ipath_driver_create_group(&ipath_driver.driver);
1765         if (ret < 0) {
1766                 printk(KERN_ERR IPATH_DRV_NAME ": Unable to create driver "
1767                        "sysfs entries: error %d\n", -ret);
1768                 goto bail_pci;
1769         }
1770
1771         ret = ipath_init_ipathfs();
1772         if (ret < 0) {
1773                 printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
1774                        "ipathfs: error %d\n", -ret);
1775                 goto bail_group;
1776         }
1777
1778         goto bail;
1779
1780 bail_group:
1781         ipath_driver_remove_group(&ipath_driver.driver);
1782
1783 bail_pci:
1784         pci_unregister_driver(&ipath_driver);
1785
1786 bail_unit:
1787         idr_destroy(&unit_table);
1788
1789 bail:
1790         return ret;
1791 }
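
/*
 * Note: the bail_* labels above unwind in the reverse order of setup
 * (sysfs driver group, then PCI driver, then the idr table), the usual
 * pattern for module-init error handling.
 */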
1792
1793 static void cleanup_device(struct ipath_devdata *dd)
1794 {
1795         int port;
1796
1797         ipath_shutdown_device(dd);
1798
1799         if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
1800                 /* can't do anything more with chip; needs re-init */
1801                 *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
1802                 if (dd->ipath_kregbase) {
1803                         /*
1804                          * if we haven't already cleaned up, clear these now
1805                          * to ensure any register reads/writes "fail" until
1806                          * re-init
1807                          */
1808                         dd->ipath_kregbase = NULL;
1809                         dd->ipath_kregvirt = NULL;
1810                         dd->ipath_uregbase = 0;
1811                         dd->ipath_sregbase = 0;
1812                         dd->ipath_cregbase = 0;
1813                         dd->ipath_kregsize = 0;
1814                 }
1815                 ipath_disable_wc(dd);
1816         }
1817
1818         if (dd->ipath_pioavailregs_dma) {
1819                 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
1820                                   (void *) dd->ipath_pioavailregs_dma,
1821                                   dd->ipath_pioavailregs_phys);
1822                 dd->ipath_pioavailregs_dma = NULL;
1823         }
1824
1825         if (dd->ipath_pageshadow) {
1826                 struct page **tmpp = dd->ipath_pageshadow;
1827                 int i, cnt = 0;
1828
1829                 ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
1830                            "locked\n");
1831                 for (port = 0; port < dd->ipath_cfgports; port++) {
1832                         int port_tidbase = port * dd->ipath_rcvtidcnt;
1833                         int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
1834                         for (i = port_tidbase; i < maxtid; i++) {
1835                                 if (!tmpp[i])
1836                                         continue;
1837                                 ipath_release_user_pages(&tmpp[i], 1);
1838                                 tmpp[i] = NULL;
1839                                 cnt++;
1840                         }
1841                 }
1842                 if (cnt) {
1843                         ipath_stats.sps_pageunlocks += cnt;
1844                         ipath_cdbg(VERBOSE, "There were still %u expTID "
1845                                    "entries locked\n", cnt);
1846                 }
1847                 if (ipath_stats.sps_pagelocks ||
1848                     ipath_stats.sps_pageunlocks)
1849                         ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
1850                                    "unlocked via ipath_m{un}lock\n",
1851                                    (unsigned long long)
1852                                    ipath_stats.sps_pagelocks,
1853                                    (unsigned long long)
1854                                    ipath_stats.sps_pageunlocks);
1855
1856                 ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
1857                            dd->ipath_pageshadow);
1858                 vfree(dd->ipath_pageshadow);
1859                 dd->ipath_pageshadow = NULL;
1860         }
1861
1862         /*
1863          * free any resources still in use (usually just kernel ports)
1864          * at unload
1865          */
1866         for (port = 0; port < dd->ipath_cfgports; port++)
1867                 ipath_free_pddata(dd, port, 1);
1868         kfree(dd->ipath_pd);
1869         /*
1870          * for debuggability, in case some cleanup path tries to use it
1871          * after this
1872          */
1873         dd->ipath_pd = NULL;
1874 }
1875
1876 static void __exit infinipath_cleanup(void)
1877 {
1878         struct ipath_devdata *dd, *tmp;
1879         unsigned long flags;
1880
1881         ipath_exit_ipathfs();
1882
1883         ipath_driver_remove_group(&ipath_driver.driver);
1884
1885         spin_lock_irqsave(&ipath_devs_lock, flags);
1886
1887         /*
1888          * Turn off rcv, send, and interrupts for all ports.  (Should all
1889          * drivers also hard reset the chip here?)  Free up the port 0
1890          * (kernel) rcvhdr and eager bufs, and eventually the tid bufs,
1891          * for all versions of the driver, if they were allocated.
1892          */
1893         list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
1894                 spin_unlock_irqrestore(&ipath_devs_lock, flags);
1895
1896                 if (dd->ipath_kregbase)
1897                         cleanup_device(dd);
1898
1899                 if (dd->pcidev) {
1900                         if (dd->pcidev->irq) {
1901                                 ipath_cdbg(VERBOSE,
1902                                            "unit %u free_irq of irq %x\n",
1903                                            dd->ipath_unit, dd->pcidev->irq);
1904                                 free_irq(dd->pcidev->irq, dd);
1905                         } else
1906                                 ipath_dbg("irq is 0, not doing free_irq "
1907                                           "for unit %u\n", dd->ipath_unit);
1908
1909                         /*
1910                          * we check for NULL here, because it's outside
1911                          * the kregbase check, and we need to call it
1912                          * after the free_irq.  Thus it's possible that
1913                          * the function pointers were never initialized.
1914                          */
1915                         if (dd->ipath_f_cleanup)
1916                                 /* clean up chip-specific stuff */
1917                                 dd->ipath_f_cleanup(dd);
1918
1919                         dd->pcidev = NULL;
1920                 }
1921                 spin_lock_irqsave(&ipath_devs_lock, flags);
1922         }
1923
1924         spin_unlock_irqrestore(&ipath_devs_lock, flags);
1925
1926         ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
1927         pci_unregister_driver(&ipath_driver);
1928
1929         idr_destroy(&unit_table);
1930 }
1931
1932 /**
1933  * ipath_reset_device - reset the chip if possible
1934  * @unit: the device to reset
1935  *
1936  * Whether or not reset is successful, we attempt to re-initialize the chip
1937  * (that is, much like a driver unload/reload).  We clear the INITTED flag
1938  * so that the various entry points will fail until we reinitialize.  For
1939  * now, we only allow this if no user ports are open that use chip resources
1940  */
1941 int ipath_reset_device(int unit)
1942 {
1943         int ret, i;
1944         struct ipath_devdata *dd = ipath_lookup(unit);
1945
1946         if (!dd) {
1947                 ret = -ENODEV;
1948                 goto bail;
1949         }
1950
1951         dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
1952
1953         if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
1954                 dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
1955                          "not initialized or not present\n", unit);
1956                 ret = -ENXIO;
1957                 goto bail;
1958         }
1959
1960         if (dd->ipath_pd)
1961                 for (i = 1; i < dd->ipath_cfgports; i++) {
1962                         if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) {
1963                                 ipath_dbg("unit %u port %d is in use "
1964                                           "(PID %u cmd %s), can't reset\n",
1965                                           unit, i,
1966                                           dd->ipath_pd[i]->port_pid,
1967                                           dd->ipath_pd[i]->port_comm);
1968                                 ret = -EBUSY;
1969                                 goto bail;
1970                         }
1971                 }
1972
1973         dd->ipath_flags &= ~IPATH_INITTED;
1974         ret = dd->ipath_f_reset(dd);
1975         if (ret != 1)
1976                 ipath_dbg("reset was not successful\n");
1977         ipath_dbg("Trying to reinitialize unit %u after reset attempt\n",
1978                   unit);
1979         ret = ipath_init_chip(dd, 1);
1980         if (ret)
1981                 ipath_dev_err(dd, "Reinitialize unit %u after "
1982                               "reset failed with %d\n", unit, ret);
1983         else
1984                 dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
1985                          "resetting\n", unit);
1986
1987 bail:
1988         return ret;
1989 }
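
/*
 * Illustrative sketch of the expected calling pattern (the actual caller
 * is elsewhere, presumably an administrative interface):
 *
 *	int err = ipath_reset_device(unit);
 *
 *	if (err)
 *		report that the reset or the follow-up reinit failed
 *		(-ENODEV, -ENXIO, -EBUSY, or the ipath_init_chip() error);
 */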
1990
1991 module_init(infinipath_init);
1992 module_exit(infinipath_cleanup);