--- orig/drivers/scsi/scsi_error.c	Wed Sep 25 23:35:47 2002
+++ linux-rpc/drivers/scsi/scsi_error.c	Wed Sep 25 23:37:31 2002
@@ -1680,9 +1680,9 @@
 
 static void scsi_unjam_bus_reset(struct Scsi_Host *host, Scsi_Cmnd **done)
 {
-	Scsi_Device *SDpnt, *SDloop;
-	Scsi_Cmnd *SCpnt, *SCloop;
-	int rtn;
+	Scsi_Device *SDpnt;
+	Scsi_Cmnd *SCpnt;
+	int rtn, channel, max_channel = 0;
 
 	/*
 	 * If we ended up here, we have serious problems.  The only thing left
@@ -1691,69 +1691,105 @@
 	 */
 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
 
-	/* 
-	 * We really want to loop over the various channels, and do this on
-	 * a channel by channel basis.  We should also check to see if any
-	 * of the failed commands are on soft_reset devices, and if so, skip
-	 * the reset.  
+	/*
+	 * Find the maximum channel number for this host.
 	 */
-	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
-	      next_device:
-		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
-			if (SCpnt->state != SCSI_STATE_FAILED
-			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
+	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next)
+		if (SDpnt->channel > max_channel)
+			max_channel = SDpnt->channel;
+
+	/*
+	 * Loop over each channel, and see if any device on
+	 * that channel has failed.
+	 */
+	for (channel = 0; channel <= max_channel; channel++) {
+		Scsi_Cmnd *failed_command;
+		int soft_reset;
+
+ try_again:
+ 		failed_command = NULL;
+ 		soft_reset = 0;
+
+		/*
+		 * Loop over each device on this channel locating any
+		 * failed command.  We need a Scsi_Cmnd structure to
+		 * call the bus reset function.
+		 *
+		 * We also need to check if any of the failed commands
+		 * are on soft_reset devices, and if so, skip the reset.  
+		 */
+		for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
+			if (SDpnt->channel != channel)
 				continue;
-			}
-			/*
-			 * We have a failed command.  Make sure there are no other failed
-			 * commands on the same channel that are timed out and implement a
-			 * soft reset.
-			 */
-			for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
-				for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
-					if (SCloop->channel != SCpnt->channel) {
-						continue;
-					}
-					if (SCloop->state != SCSI_STATE_FAILED
-					    && SCloop->state != SCSI_STATE_TIMEOUT) {
-						continue;
-					}
-					if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
-						/* 
-						 * If this device uses the soft reset option, and this
-						 * is one of the devices acting up, then our only
-						 * option is to wait a bit, since the command is
-						 * supposedly still running.  
-						 *
-						 * FIXME(eric) - right now we will just end up falling
-						 * through to the 'take device offline' case.
-						 *
-						 * FIXME(eric) - It is possible that the command completed
-						 * *after* the error recovery procedure started, and if this
-						 * is the case, we are worrying about nothing here.
-						 */
-
-						scsi_sleep(1 * HZ);
-						goto next_device;
-					}
-				}
-			}
+
+			SCpnt = scsi_eh_find_failed_command(SDpnt);
+			if (SCpnt)
+				failed_command = SCpnt;
 
 			/*
-			 * We now know that we are able to perform a reset for the
-			 * bus that SCpnt points to.  There are no soft-reset devices
-			 * with outstanding timed out commands.
+			 * Remember if this device has timed out or failed
+			 * commands and uses the soft_reset option.
 			 */
-			rtn = scsi_try_bus_reset(SCpnt);
-			if (rtn == SUCCESS) {
-				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
-					rtn = scsi_eh_restart_device(SDloop, done);
-
-					if (rtn != SUCCESS)
-						scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after bus reset");
-				}
-			}
-			break;
+			if (SCpnt && SDpnt->soft_reset)
+				soft_reset = 1;
+		}
+
+		/*
+		 * If this channel hasn't failed, we
+		 * don't need to reset it.
+		 */
+		if (!failed_command)
+			continue;
+
+		/* 
+		 * If this device uses the soft reset option, and this
+		 * is one of the devices acting up, then our only
+		 * option is to wait a bit, since the command is
+		 * supposedly still running.  
+		 *
+		 * FIXME(eric) - right now we will just end up falling
+		 * through to the 'take device offline' case.
+		 *
+		 * FIXME(eric) - It is possible that the command completed
+		 * *after* the error recovery procedure started, and if this
+		 * is the case, we are worrying about nothing here.
+		 *
+		 * FIXME(rmk) - This should be bounded; we shouldn't wait
+		 * for an infinite amount of time for any device.
+		 */
+		if (soft_reset) {
+			SCSI_LOG_ERROR_RECOVERY(3,
+				printk("scsi_unjam_host: unable to try bus "
+					"reset for host %d channel %d\n",
+					host->host_no, channel));
+			scsi_sleep(1 * HZ);
+			goto try_again;
+		}
+
+		/*
+		 * We now know that we are able to perform a reset for the
+		 * bus that failed_command points to.  There are no soft-reset
+		 * devices with outstanding timed out commands.
+		 */
+		rtn = scsi_try_bus_reset(failed_command);
+
+		/*
+		 * If we failed to reset the bus, move on to the next bus.
+		 */
+		if (rtn != SUCCESS)
+			continue;
+
+		/*
+		 * We succeeded.  Retry each failed command.
+		 */
+		for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
+			if (SDpnt->channel != channel)
+				continue;
+
+			rtn = scsi_eh_restart_device(SDpnt, done);
+
+			if (rtn != SUCCESS)
+				scsi_eh_set_device_offline(SDpnt, done, "not ready or command retry failed after bus reset");
 		}
 	}
 }
@@ -1762,7 +1798,8 @@
 {
 	Scsi_Device *SDpnt, *SDloop;
 	Scsi_Cmnd *SCpnt;
-	int rtn;
+	Scsi_Cmnd *failed_command = NULL;
+	int rtn, soft_reset;
 
 	/*
 	 * If we ended up here, we have serious problems.  The only thing left
@@ -1780,58 +1817,71 @@
 	 * skip the host reset option if any of the failed devices are configured
 	 * to use the soft reset option.
 	 */
+
+ try_again:
+	failed_command = NULL;
+	soft_reset = 0;
+
 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
-	      next_device2:
-		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
-			if (SCpnt->state != SCSI_STATE_FAILED
-			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
-				continue;
-			}
-			if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
-				/* 
-				 * If this device uses the soft reset option, and this
-				 * is one of the devices acting up, then our only
-				 * option is to wait a bit, since the command is
-				 * supposedly still running.  
-				 *
-				 * FIXME(eric) - right now we will just end up falling
-				 * through to the 'take device offline' case.
-				 */
-				SCSI_LOG_ERROR_RECOVERY(3,
-							printk("scsi_unjam_host: Unable to try hard host reset\n"));
-
-				/*
-				 * Due to the spinlock, we will never get out of this
-				 * loop without a proper wait. (DB)
-				 */
-				scsi_sleep(1 * HZ);
+		/*
+		 * Locate any failed commands for this device.
+		 */
+		SCpnt = scsi_eh_find_failed_command(SDpnt);
+		if (SCpnt)
+			failed_command = SCpnt;
 
-				goto next_device2;
-			}
-			SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
+		/*
+		 * Remember if this device has timed out or failed commands
+		 * and uses the soft_reset option.
+		 */
+		if (SCpnt && SDpnt->soft_reset)
+			soft_reset = 1;
+	}
+
+	/* 
+	 * If this device uses the soft reset option, and this
+	 * is one of the devices acting up, then our only
+	 * option is to wait a bit, since the command is
+	 * supposedly still running.  
+	 *
+	 * FIXME(eric) - right now we will just end up falling
+	 * through to the 'take device offline' case.
+	 */
+	if (soft_reset) {
+		SCSI_LOG_ERROR_RECOVERY(3,
+			printk("scsi_unjam_host: unable to try "
+				"hard host reset\n"));
 
 			/*
-			 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
+			 * Due to the spinlock, we will never get out of this
+			 * loop without a proper wait. (DB)
 			 */
-			rtn = scsi_try_host_reset(SCpnt);
-			if (rtn == SUCCESS) {
-				/*
-				 * FIXME(eric) we assume that all commands are flushed from the
-				 * controller.  We should get a DID_RESET for all of the commands
-				 * that were pending.  We should ignore these so that we can
-				 * guarantee that we are in a consistent state.
-				 *
-				 * I believe this to be the case right now, but this needs to be
-				 * tested.
-				 */
-				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
-					rtn = scsi_eh_restart_device(SDloop, done);
-
-					if (rtn != SUCCESS)
-						scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after host reset");
-				}
-			}
-			return;
+			scsi_sleep(1 * HZ);
+
+			goto try_again;
+	}
+
+	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
+
+	/*
+	 * FIXME(eric) - we need a valid SCpnt for this call; failed_command may be NULL.
+	 */
+	rtn = scsi_try_host_reset(failed_command);
+	if (rtn == SUCCESS) {
+		/*
+		 * FIXME(eric) we assume that all commands are flushed from the
+		 * controller.  We should get a DID_RESET for all of the commands
+		 * that were pending.  We should ignore these so that we can
+		 * guarantee that we are in a consistent state.
+		 *
+		 * I believe this to be the case right now, but this needs to be
+		 * tested.
+		 */
+		for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
+			rtn = scsi_eh_restart_device(SDloop, done);
+
+			if (rtn != SUCCESS)
+				scsi_eh_set_device_offline(SDloop, done, "not ready or command retry failed after host reset");
 		}
 	}
 }