--- orig/drivers/scsi/scsi_error.c Wed Sep 25 23:35:47 2002 +++ linux-rpc/drivers/scsi/scsi_error.c Wed Sep 25 23:37:31 2002 @@ -1680,9 +1680,9 @@ static void scsi_unjam_bus_reset(struct Scsi_Host *host, Scsi_Cmnd **done) { - Scsi_Device *SDpnt, *SDloop; - Scsi_Cmnd *SCpnt, *SCloop; - int rtn; + Scsi_Device *SDpnt; + Scsi_Cmnd *SCpnt; + int rtn, channel, max_channel = 0; /* * If we ended up here, we have serious problems. The only thing left @@ -1691,69 +1691,105 @@ */ SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n")); - /* - * We really want to loop over the various channels, and do this on - * a channel by channel basis. We should also check to see if any - * of the failed commands are on soft_reset devices, and if so, skip - * the reset. + /* + * Find the maximum channel number for this host. */ - for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { - next_device: - for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { - if (SCpnt->state != SCSI_STATE_FAILED - && SCpnt->state != SCSI_STATE_TIMEOUT) { + for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) + if (SDpnt->channel > max_channel) + max_channel = SDpnt->channel; + + /* + * Loop over each channel, and see if any device on + * each channel has failed. + */ + for (channel = 0; channel <= max_channel; channel++) { + Scsi_Cmnd *failed_command; + int soft_reset; + + try_again: + failed_command = NULL; + soft_reset = 0; + + /* + * Loop over each device on this channel locating any + * failed command. We need a Scsi_Cmnd structure to + * call the bus reset function. + * + * We also need to check if any of the failed commands + * are on soft_reset devices, and if so, skip the reset. + */ + for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { + if (SDpnt->channel != channel) continue; - } - /* - * We have a failed command. Make sure there are no other failed - * commands on the same channel that are timed out and implement a - soft reset. 
- */ - for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { - for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) { - if (SCloop->channel != SCpnt->channel) { - continue; - } - if (SCloop->state != SCSI_STATE_FAILED - && SCloop->state != SCSI_STATE_TIMEOUT) { - continue; - } - if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) { - /* - * If this device uses the soft reset option, and this - * is one of the devices acting up, then our only - * option is to wait a bit, since the command is - * supposedly still running. - * - * FIXME(eric) - right now we will just end up falling - * through to the 'take device offline' case. - * - * FIXME(eric) - It is possible that the command completed - * *after* the error recovery procedure started, and if this - * is the case, we are worrying about nothing here. - */ - - scsi_sleep(1 * HZ); - goto next_device; - } - } - } + + SCpnt = scsi_eh_find_failed_command(SDpnt); + if (SCpnt) + failed_command = SCpnt; /* - * We now know that we are able to perform a reset for the - * bus that SCpnt points to. There are no soft-reset devices - * with outstanding timed out commands. + * If this device has timed out or failed commands, + * and uses the soft_reset option. */ - rtn = scsi_try_bus_reset(SCpnt); - if (rtn == SUCCESS) { - for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { - rtn = scsi_eh_restart_device(SDloop, done); - - if (rtn != SUCCESS) - scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after bus reset"); - } - } - break; + if (SCpnt && SDpnt->soft_reset) + soft_reset = 1; + } + + /* + * If this channel hasn't failed, we + * don't need to reset it. + */ + if (!failed_command) + continue; + + /* + * If this device uses the soft reset option, and this + * is one of the devices acting up, then our only + * option is to wait a bit, since the command is + * supposedly still running. 
+ * + * FIXME(eric) - right now we will just end up falling + * through to the 'take device offline' case. + * + * FIXME(eric) - It is possible that the command completed + * *after* the error recovery procedure started, and if this + * is the case, we are worrying about nothing here. + * + * FIXME(rmk) - This should be bounded; we shouldn't wait + * for an infinite amount of time for any device. + */ + if (soft_reset) { + SCSI_LOG_ERROR_RECOVERY(3, + printk("scsi_unjam_host: unable to try bus " + "reset for host %d channel %d\n", + host->host_no, channel)); + scsi_sleep(1 * HZ); + goto try_again; + } + + /* + * We now know that we are able to perform a reset for the + * bus that SCpnt points to. There are no soft-reset devices + * with outstanding timed out commands. + */ + rtn = scsi_try_bus_reset(failed_command); + + /* + * If we failed to reset the bus, move on to the next bus. + */ + if (rtn != SUCCESS) + continue; + + /* + * We succeeded. Retry each failed command. + */ + for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { + if (SDpnt->channel != channel) + continue; + + rtn = scsi_eh_restart_device(SDpnt, done); + + if (rtn != SUCCESS) + scsi_eh_set_device_offline(SDpnt, done, "not ready or command retry failed after bus reset"); } } } @@ -1762,7 +1798,8 @@ { Scsi_Device *SDpnt, *SDloop; Scsi_Cmnd *SCpnt; - int rtn; + Scsi_Cmnd *failed_command = NULL; + int rtn, soft_reset; /* * If we ended up here, we have serious problems. The only thing left @@ -1780,58 +1817,71 @@ * skip the host reset option if any of the failed devices are configured * to use the soft reset option. 
*/ + + try_again: + failed_command = NULL; + soft_reset = 0; + for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { - next_device2: - for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { - if (SCpnt->state != SCSI_STATE_FAILED - && SCpnt->state != SCSI_STATE_TIMEOUT) { - continue; - } - if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) { - /* - * If this device uses the soft reset option, and this - * is one of the devices acting up, then our only - * option is to wait a bit, since the command is - * supposedly still running. - * - * FIXME(eric) - right now we will just end up falling - * through to the 'take device offline' case. - */ - SCSI_LOG_ERROR_RECOVERY(3, - printk("scsi_unjam_host: Unable to try hard host reset\n")); - - /* - * Due to the spinlock, we will never get out of this - * loop without a proper wait. (DB) - */ - scsi_sleep(1 * HZ); + /* + * Locate any failed commands for this device. + */ + SCpnt = scsi_eh_find_failed_command(SDpnt); + if (SCpnt) + failed_command = SCpnt; - goto next_device2; - } - SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n")); + /* + * If this device has timed out or failed commands, + * and uses the soft_reset option. + */ + if (SCpnt && SDpnt->soft_reset) + soft_reset = 1; + } + + /* + * If this device uses the soft reset option, and this + * is one of the devices acting up, then our only + * option is to wait a bit, since the command is + * supposedly still running. + * + * FIXME(eric) - right now we will just end up falling + * through to the 'take device offline' case. + */ + if (soft_reset) { + SCSI_LOG_ERROR_RECOVERY(3, + printk("scsi_unjam_host: unable to try " + "hard host reset\n")); /* - * FIXME(eric) - we need to obtain a valid SCpnt to perform this call. + * Due to the spinlock, we will never get out of this + * loop without a proper wait. 
(DB) */ - rtn = scsi_try_host_reset(SCpnt); - if (rtn == SUCCESS) { - /* - * FIXME(eric) we assume that all commands are flushed from the - * controller. We should get a DID_RESET for all of the commands - * that were pending. We should ignore these so that we can - * guarantee that we are in a consistent state. - * - * I believe this to be the case right now, but this needs to be - * tested. - */ - for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { - rtn = scsi_eh_restart_device(SDloop, done); - - if (rtn != SUCCESS) - scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after host reset"); - } - } - return; + scsi_sleep(1 * HZ); + + goto try_again; + } + + SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n")); + + /* + * FIXME(eric) - we need to obtain a valid SCpnt to perform this call. NOTE(review): failed_command is still NULL here when scsi_eh_find_failed_command() returned NULL for every device; the bus reset path skips the reset in that case, this path does not. Confirm scsi_try_host_reset() copes with NULL or add a guard. + */ + rtn = scsi_try_host_reset(failed_command); + if (rtn == SUCCESS) { + /* + * FIXME(eric) we assume that all commands are flushed from the + * controller. We should get a DID_RESET for all of the commands + * that were pending. We should ignore these so that we can + * guarantee that we are in a consistent state. + * + * I believe this to be the case right now, but this needs to be + * tested. + */ + for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { + rtn = scsi_eh_restart_device(SDloop, done); + + if (rtn != SUCCESS) + scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after host reset"); } } }