git.oblomov.eu Git - linux-2.6/blob - arch/powerpc/platforms/pseries/eeh_driver.c

   1 /*
   2  * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
   3  * Copyright IBM Corp. 2004 2005
   4  * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
   5  *
   6  * All rights reserved.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or (at
  11  * your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful, but
  14  * WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  16  * NON INFRINGEMENT.  See the GNU General Public License for more
  17  * details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22  *
  23  * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
  24  */
  25 #include <linux/delay.h>
  26 #include <linux/interrupt.h>
  27 #include <linux/irq.h>
  28 #include <linux/pci.h>
  29 #include <asm/eeh.h>
  30 #include <asm/eeh_event.h>
  31 #include <asm/ppc-pci.h>
  32 #include <asm/pci-bridge.h>
  33 #include <asm/prom.h>
  34 #include <asm/rtas.h>
  35
  36
  37 static inline const char * pcid_name (struct pci_dev *pdev)
  38 {
  39         if (pdev && pdev->dev.driver)
  40                 return pdev->dev.driver->name;
  41         return "";
  42 }
  43
  44 #ifdef DEBUG
  45 static void print_device_node_tree (struct pci_dn *pdn, int dent)
  46 {
  47         int i;
  48         if (!pdn) return;
  49         for (i=0;i<dent; i++)
  50                 printk(" ");
  51         printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
  52                 pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
  53                 pdn->eeh_pe_config_addr, pdn->node->full_name);
  54         dent += 3;
  55         struct device_node *pc = pdn->node->child;
  56         while (pc) {
  57                 print_device_node_tree(PCI_DN(pc), dent);
  58                 pc = pc->sibling;
  59         }
  60 }
  61 #endif
  62
  63 /**
  64  * irq_in_use - return true if this irq is being used
  65  */
  66 static int irq_in_use(unsigned int irq)
  67 {
  68         int rc = 0;
  69         unsigned long flags;
  70    struct irq_desc *desc = irq_desc + irq;
  71
  72         spin_lock_irqsave(&desc->lock, flags);
  73         if (desc->action)
  74                 rc = 1;
  75         spin_unlock_irqrestore(&desc->lock, flags);
  76         return rc;
  77 }
  78
  79 /* ------------------------------------------------------- */
  80 /**
  81  * eeh_report_error - report pci error to each device driver
  82  *
  83  * Report an EEH error to each device driver, collect up and
  84  * merge the device driver responses. Cumulative response
  85  * passed back in "userdata".
  86  */
  87
  88 static void eeh_report_error(struct pci_dev *dev, void *userdata)
  89 {
  90         enum pci_ers_result rc, *res = userdata;
  91         struct pci_driver *driver = dev->driver;
  92
  93         dev->error_state = pci_channel_io_frozen;
  94
  95         if (!driver)
  96                 return;
  97
  98         if (irq_in_use (dev->irq)) {
  99                 struct device_node *dn = pci_device_to_OF_node(dev);
 100                 PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
 101                 disable_irq_nosync(dev->irq);
 102         }
 103         if (!driver->err_handler ||
 104             !driver->err_handler->error_detected)
 105                 return;
 106
 107         rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);
 108
 109         /* A driver that needs a reset trumps all others */
 110         if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 111         if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 112 }
 113
 114 /**
 115  * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
 116  *
 117  * Tells each device driver that IO ports, MMIO and config space I/O
 118  * are now enabled. Collects up and merges the device driver responses.
 119  * Cumulative response passed back in "userdata".
 120  */
 121
 122 static void eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
 123 {
 124         enum pci_ers_result rc, *res = userdata;
 125         struct pci_driver *driver = dev->driver;
 126
 127         if (!driver ||
 128             !driver->err_handler ||
 129             !driver->err_handler->mmio_enabled)
 130                 return;
 131
 132         rc = driver->err_handler->mmio_enabled (dev);
 133
 134         /* A driver that needs a reset trumps all others */
 135         if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 136         if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 137 }
 138
 139 /**
 140  * eeh_report_reset - tell device that slot has been reset
 141  */
 142
 143 static void eeh_report_reset(struct pci_dev *dev, void *userdata)
 144 {
 145         enum pci_ers_result rc, *res = userdata;
 146         struct pci_driver *driver = dev->driver;
 147         struct device_node *dn = pci_device_to_OF_node(dev);
 148
 149         if (!driver)
 150                 return;
 151
 152         if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {
 153                 PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
 154                 enable_irq(dev->irq);
 155         }
 156         if (!driver->err_handler ||
 157             !driver->err_handler->slot_reset)
 158                 return;
 159
 160         rc = driver->err_handler->slot_reset(dev);
 161         if ((*res == PCI_ERS_RESULT_NONE) ||
 162             (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
 163         if (*res == PCI_ERS_RESULT_DISCONNECT &&
 164              rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 165 }
 166
 167 /**
 168  * eeh_report_resume - tell device to resume normal operations
 169  */
 170
 171 static void eeh_report_resume(struct pci_dev *dev, void *userdata)
 172 {
 173         struct pci_driver *driver = dev->driver;
 174         struct device_node *dn = pci_device_to_OF_node(dev);
 175
 176         dev->error_state = pci_channel_io_normal;
 177
 178         if (!driver)
 179                 return;
 180
 181         if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {
 182                 PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
 183                 enable_irq(dev->irq);
 184         }
 185         if (!driver->err_handler ||
 186             !driver->err_handler->resume)
 187                 return;
 188
 189         driver->err_handler->resume(dev);
 190 }
 191
 192 /**
 193  * eeh_report_failure - tell device driver that device is dead.
 194  *
 195  * This informs the device driver that the device is permanently
 196  * dead, and that no further recovery attempts will be made on it.
 197  */
 198
 199 static void eeh_report_failure(struct pci_dev *dev, void *userdata)
 200 {
 201         struct pci_driver *driver = dev->driver;
 202
 203         dev->error_state = pci_channel_io_perm_failure;
 204
 205         if (!driver)
 206                 return;
 207
 208         if (irq_in_use (dev->irq)) {
 209                 struct device_node *dn = pci_device_to_OF_node(dev);
 210                 PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
 211                 disable_irq_nosync(dev->irq);
 212         }
 213         if (!driver->err_handler)
 214                 return;
 215         if (!driver->err_handler->error_detected)
 216                 return;
 217         driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
 218 }
 219
 220 /* ------------------------------------------------------- */
 221 /**
 222  * handle_eeh_events -- reset a PCI device after hard lockup.
 223  *
 224  * pSeries systems will isolate a PCI slot if the PCI-Host
 225  * bridge detects address or data parity errors, DMA's
 226  * occurring to wild addresses (which usually happen due to
 227  * bugs in device drivers or in PCI adapter firmware).
 228  * Slot isolations also occur if #SERR, #PERR or other misc
 229  * PCI-related errors are detected.
 230  *
 231  * Recovery process consists of unplugging the device driver
 232  * (which generated hotplug events to userspace), then issuing
 233  * a PCI #RST to the device, then reconfiguring the PCI config
 234  * space for all bridges & devices under this slot, and then
 235  * finally restarting the device drivers (which cause a second
 236  * set of hotplug events to go out to userspace).
 237  */
 238
 239 /**
 240  * eeh_reset_device() -- perform actual reset of a pci slot
 241  * @bus: pointer to the pci bus structure corresponding
 242  *            to the isolated slot. A non-null value will
 243  *            cause all devices under the bus to be removed
 244  *            and then re-added.
 245  * @pe_dn: pointer to a "Partionable Endpoint" device node.
 246  *            This is the top-level structure on which pci
 247  *            bus resets can be performed.
 248  */
 249
 250 static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
 251 {
 252         struct device_node *dn;
 253         int cnt, rc;
 254
 255         /* pcibios will clear the counter; save the value */
 256         cnt = pe_dn->eeh_freeze_count;
 257
 258         if (bus)
 259                 pcibios_remove_pci_devices(bus);
 260
 261         /* Reset the pci controller. (Asserts RST#; resets config space).
 262          * Reconfigure bridges and devices. Don't try to bring the system
 263          * up if the reset failed for some reason. */
 264         rc = rtas_set_slot_reset(pe_dn);
 265         if (rc)
 266                 return rc;
 267
 268         /* Walk over all functions on this device.  */
 269         dn = pe_dn->node;
 270         if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
 271                 dn = dn->parent->child;
 272
 273         while (dn) {
 274                 struct pci_dn *ppe = PCI_DN(dn);
 275                 /* On Power4, always true because eeh_pe_config_addr=0 */
 276                 if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {
 277                         rtas_configure_bridge(ppe);
 278                         eeh_restore_bars(ppe);
 279                 }
 280                 dn = dn->sibling;
 281         }
 282
 283         /* Give the system 5 seconds to finish running the user-space
 284          * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
 285          * this is a hack, but if we don't do this, and try to bring
 286          * the device up before the scripts have taken it down,
 287          * potentially weird things happen.
 288          */
 289         if (bus) {
 290                 ssleep (5);
 291                 pcibios_add_pci_devices(bus);
 292         }
 293         pe_dn->eeh_freeze_count = cnt;
 294
 295         return 0;
 296 }
 297
 298 /* The longest amount of time to wait for a pci device
 299  * to come back on line, in seconds.
 300  */
 301 #define MAX_WAIT_FOR_RECOVERY 150
 302
 303 struct pci_dn * handle_eeh_events (struct eeh_event *event)
 304 {
 305         struct device_node *frozen_dn;
 306         struct pci_dn *frozen_pdn;
 307         struct pci_bus *frozen_bus;
 308         int rc = 0;
 309         enum pci_ers_result result = PCI_ERS_RESULT_NONE;
 310         const char *location, *pci_str, *drv_str;
 311
 312         frozen_dn = find_device_pe(event->dn);
 313         if (!frozen_dn) {
 314
 315                 location = of_get_property(event->dn, "ibm,loc-code", NULL);
 316                 location = location ? location : "unknown";
 317                 printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
 318                                 "for location=%s pci addr=%s\n",
 319                         location, pci_name(event->dev));
 320                 return NULL;
 321         }
 322
 323         frozen_bus = pcibios_find_pci_bus(frozen_dn);
 324         location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
 325         location = location ? location : "unknown";
 326
 327         /* There are two different styles for coming up with the PE.
 328          * In the old style, it was the highest EEH-capable device
 329          * which was always an EADS pci bridge.  In the new style,
 330          * there might not be any EADS bridges, and even when there are,
 331          * the firmware marks them as "EEH incapable". So another
 332          * two-step is needed to find the pci bus.. */
 333         if (!frozen_bus)
 334                 frozen_bus = pcibios_find_pci_bus (frozen_dn->parent);
 335
 336         if (!frozen_bus) {
 337                 printk(KERN_ERR "EEH: Cannot find PCI bus "
 338                         "for location=%s dn=%s\n",
 339                         location, frozen_dn->full_name);
 340                 return NULL;
 341         }
 342
 343         frozen_pdn = PCI_DN(frozen_dn);
 344         frozen_pdn->eeh_freeze_count++;
 345
 346         if (frozen_pdn->pcidev) {
 347                 pci_str = pci_name (frozen_pdn->pcidev);
 348                 drv_str = pcid_name (frozen_pdn->pcidev);
 349         } else {
 350                 pci_str = pci_name (event->dev);
 351                 drv_str = pcid_name (event->dev);
 352         }
 353
 354         if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
 355                 goto excess_failures;
 356
 357         printk(KERN_WARNING
 358            "EEH: This PCI device has failed %d times in the last hour:\n",
 359                 frozen_pdn->eeh_freeze_count);
 360         printk(KERN_WARNING
 361                 "EEH: location=%s driver=%s pci addr=%s\n",
 362                 location, drv_str, pci_str);
 363
 364         /* Walk the various device drivers attached to this slot through
 365          * a reset sequence, giving each an opportunity to do what it needs
 366          * to accomplish the reset.  Each child gets a report of the
 367          * status ... if any child can't handle the reset, then the entire
 368          * slot is dlpar removed and added.
 369          */
 370         pci_walk_bus(frozen_bus, eeh_report_error, &result);
 371
 372         /* Get the current PCI slot state. This can take a long time,
 373          * sometimes over 3 seconds for certain systems. */
 374         rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000);
 375         if (rc < 0) {
 376                 printk(KERN_WARNING "EEH: Permanent failure\n");
 377                 goto hard_fail;
 378         }
 379
 380         /* Since rtas may enable MMIO when posting the error log,
 381          * don't post the error log until after all dev drivers
 382          * have been informed.
 383          */
 384         eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE);
 385
 386         /* If all device drivers were EEH-unaware, then shut
 387          * down all of the device drivers, and hope they
 388          * go down willingly, without panicing the system.
 389          */
 390         if (result == PCI_ERS_RESULT_NONE) {
 391                 rc = eeh_reset_device(frozen_pdn, frozen_bus);
 392                 if (rc) {
 393                         printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
 394                         goto hard_fail;
 395                 }
 396         }
 397
 398         /* If all devices reported they can proceed, then re-enable MMIO */
 399         if (result == PCI_ERS_RESULT_CAN_RECOVER) {
 400                 rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO);
 401
 402                 if (rc < 0)
 403                         goto hard_fail;
 404                 if (rc) {
 405                         result = PCI_ERS_RESULT_NEED_RESET;
 406                 } else {
 407                         result = PCI_ERS_RESULT_NONE;
 408                         pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
 409                 }
 410         }
 411
 412         /* If all devices reported they can proceed, then re-enable DMA */
 413         if (result == PCI_ERS_RESULT_CAN_RECOVER) {
 414                 rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA);
 415
 416                 if (rc < 0)
 417                         goto hard_fail;
 418                 if (rc)
 419                         result = PCI_ERS_RESULT_NEED_RESET;
 420                 else
 421                         result = PCI_ERS_RESULT_RECOVERED;
 422         }
 423
 424         /* If any device has a hard failure, then shut off everything. */
 425         if (result == PCI_ERS_RESULT_DISCONNECT) {
 426                 printk(KERN_WARNING "EEH: Device driver gave up\n");
 427                 goto hard_fail;
 428         }
 429
 430         /* If any device called out for a reset, then reset the slot */
 431         if (result == PCI_ERS_RESULT_NEED_RESET) {
 432                 rc = eeh_reset_device(frozen_pdn, NULL);
 433                 if (rc) {
 434                         printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
 435                         goto hard_fail;
 436                 }
 437                 result = PCI_ERS_RESULT_NONE;
 438                 pci_walk_bus(frozen_bus, eeh_report_reset, &result);
 439         }
 440
 441         /* All devices should claim they have recovered by now. */
 442         if ((result != PCI_ERS_RESULT_RECOVERED) &&
 443             (result != PCI_ERS_RESULT_NONE)) {
 444                 printk(KERN_WARNING "EEH: Not recovered\n");
 445                 goto hard_fail;
 446         }
 447
 448         /* Tell all device drivers that they can resume operations */
 449         pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
 450
 451         return frozen_pdn;
 452
 453 excess_failures:
 454         /*
 455          * About 90% of all real-life EEH failures in the field
 456          * are due to poorly seated PCI cards. Only 10% or so are
 457          * due to actual, failed cards.
 458          */
 459         printk(KERN_ERR
 460            "EEH: PCI device at location=%s driver=%s pci addr=%s \n"
 461                 "has failed %d times in the last hour "
 462                 "and has been permanently disabled. \n"
 463                 "Please try reseating this device or replacing it.\n",
 464                 location, drv_str, pci_str, frozen_pdn->eeh_freeze_count);
 465         goto perm_error;
 466
 467 hard_fail:
 468         printk(KERN_ERR
 469            "EEH: Unable to recover from failure of PCI device "
 470            "at location=%s driver=%s pci addr=%s \n"
 471            "Please try reseating this device or replacing it.\n",
 472                 location, drv_str, pci_str);
 473
 474 perm_error:
 475         eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE);
 476
 477         /* Notify all devices that they're about to go down. */
 478         pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
 479
 480         /* Shut down the device drivers for good. */
 481         pcibios_remove_pci_devices(frozen_bus);
 482
 483         return NULL;
 484 }
 485
 486 /* ---------- end of file ---------- */