aboutsummaryrefslogtreecommitdiff
path: root/sys/cam/cam_periph.c
diff options
context:
space:
mode:
authorKenneth D. Merry <ken@FreeBSD.org>2001-03-27 05:45:52 +0000
committerKenneth D. Merry <ken@FreeBSD.org>2001-03-27 05:45:52 +0000
commit3393f8daa3b4b18786cd585c1a37aa16dd03d410 (patch)
tree27779a91fcb1ae1ec6575f47b3f6d89c118cc5f2 /sys/cam/cam_periph.c
parent110a013333aa74d6a402b03d3ff6cec4baf4a49b (diff)
downloadsrc-3393f8daa3b4b18786cd585c1a37aa16dd03d410.tar.gz
src-3393f8daa3b4b18786cd585c1a37aa16dd03d410.zip
Rewrite of the CAM error recovery code.
Some of the major changes include: - The SCSI error handling portion of cam_periph_error() has been broken out into a number of subfunctions to better modularize the code that handles the hierarchy of SCSI errors. As a result, the code is now much easier to read. - String handling and error printing have been significantly revamped. We now use sbufs to do string formatting instead of using printfs (for the kernel) and snprintf/strncat (for userland) as before. There is a new catchall error printing routine, cam_error_print(), and its string-based counterpart, cam_error_string(), that allow the kernel and userland applications to pass in a CCB and have errors printed out properly, whether or not they're SCSI errors. Among other things, this helped eliminate a fair amount of duplicate code in camcontrol. We now print out more information than before, including the CAM status and SCSI status and the error recovery action taken to remedy the problem. - sbufs are now available in userland, via libsbuf. This change was necessary since most of the error printing code is shared between libcam and the kernel. - A new transfer settings interface is included in this checkin. This code is #ifdef'ed out, and is primarily intended to aid discussion with HBA driver authors on the final form the interface should take. There is example code in the ahc(4) driver that implements the HBA driver side of the new interface. The new transfer settings code won't be enabled until we're ready to switch all HBA drivers over to the new interface. src/Makefile.inc1, lib/Makefile: Add libsbuf. It must be built before libcam, since libcam uses sbuf routines. libcam/Makefile: libcam now depends on libsbuf. libsbuf/Makefile: Add a makefile for libsbuf. This pulls in the sbuf sources from sys/kern. bsd.libnames.mk: Add LIBSBUF. camcontrol/Makefile: Add -lsbuf. Since camcontrol is statically linked, we can't depend on the dynamic linker to pull in libsbuf. 
camcontrol.c: Use cam_error_print() instead of checking for CAM_SCSI_STATUS_ERROR on every failed CCB. sbuf.9: Change the prototypes for sbuf_cat() and sbuf_cpy() so that the source string is now a const char *. This is more in line with the standard system string functions, and helps eliminate warnings when dealing with a const source buffer. Fix a typo. cam.c: Add description strings for the various CAM error status values, as well as routines to look up those strings. Add new cam_error_string() and cam_error_print() routines for userland and the kernel. cam.h: Add a new CAM flag, CAM_RETRY_SELTO. Add enumerated types for the various options available with cam_error_print() and cam_error_string(). cam_ccb.h: Add new transfer negotiation structures/types. Change inq_len in the ccb_getdev structure to be "reserved". This field has never been filled in, and will be removed when we next bump the CAM version. cam_debug.h: Fix typo. cam_periph.c: Modularize cam_periph_error(). The SCSI error handling part of cam_periph_error() is now in camperiphscsistatuserror() and camperiphscsisenseerror(). In cam_periph_lock(), increase the reference count on the periph while we wait for our lock attempt to succeed so that the periph won't go away while we're sleeping. cam_xpt.c: Add new transfer negotiation code. (ifdefed out) Add a new function, xpt_path_string(). This is a string/sbuf analog to xpt_print_path(). scsi_all.c: Revamp string handling and error printing code. We now use sbufs for much of the string formatting code. More of that code is shared between userland and the kernel. scsi_all.h: Get rid of SS_TURSTART, it wasn't terribly useful in the first place. Add a new error action, SS_REQSENSE. (Send a request sense and then retry the command.) This is useful when the controller hasn't performed autosense for some reason. Change the default actions around a bit. scsi_cd.c, scsi_da.c, scsi_pt.c, scsi_ses.c: SF_RETRY_SELTO -> CAM_RETRY_SELTO. 
Selection timeouts shouldn't be covered by a sense flag. scsi_pass.[ch]: SF_RETRY_SELTO -> CAM_RETRY_SELTO. Get rid of the last vestiges of a read/write interface. libkern/bsearch.c, sys/libkern.h, conf/files: Add bsearch.c, which is needed for some of the new table lookup routines. aic7xxx_freebsd.c: Define AHC_NEW_TRAN_SETTINGS if CAM_NEW_TRAN_CODE is defined. sbuf.h, subr_sbuf.c: Add the appropriate #ifdefs so sbufs can compile and run in userland. Change sbuf_printf() to use vsnprintf() instead of kvprintf(), which is only available in the kernel. Change the source string for sbuf_cpy() and sbuf_cat() to be a const char *. Add __BEGIN_DECLS and __END_DECLS around function prototypes since they're now exported to userland. kdump/mkioctls: Include stdio.h before cam.h since cam.h now includes a function with a FILE * argument. Submitted by: gibbs (mostly) Reviewed by: jdp, marcel (libsbuf makefile changes) Reviewed by: des (sbuf changes) Reviewed by: ken
Notes
Notes: svn path=/head/; revision=74840
Diffstat (limited to 'sys/cam/cam_periph.c')
-rw-r--r--sys/cam/cam_periph.c918
1 files changed, 445 insertions, 473 deletions
diff --git a/sys/cam/cam_periph.c b/sys/cam/cam_periph.c
index eba178710ced..ee6eace58326 100644
--- a/sys/cam/cam_periph.c
+++ b/sys/cam/cam_periph.c
@@ -62,6 +62,20 @@ static u_int camperiphunit(struct periph_driver *p_drv,
static void camperiphdone(struct cam_periph *periph,
union ccb *done_ccb);
static void camperiphfree(struct cam_periph *periph);
+static int camperiphscsistatuserror(union ccb *ccb,
+ cam_flags camflags,
+ u_int32_t sense_flags,
+ union ccb *save_ccb,
+ int *openings,
+ u_int32_t *relsim_flags,
+ u_int32_t *timeout);
+static int camperiphscsisenseerror(union ccb *ccb,
+ cam_flags camflags,
+ u_int32_t sense_flags,
+ union ccb *save_ccb,
+ int *openings,
+ u_int32_t *relsim_flags,
+ u_int32_t *timeout);
static int nperiph_drivers;
struct periph_driver **periph_drivers;
@@ -473,15 +487,23 @@ cam_periph_lock(struct cam_periph *periph, int priority)
{
int error;
+ /*
+ * Increment the reference count on the peripheral
+ * while we wait for our lock attempt to succeed
+ * to ensure the peripheral doesn't dissappear
+ * out from under us while we sleep.
+ */
+ if (cam_periph_acquire(periph) != CAM_REQ_CMP)
+ return(ENXIO);
+
while ((periph->flags & CAM_PERIPH_LOCKED) != 0) {
periph->flags |= CAM_PERIPH_LOCK_WANTED;
- if ((error = tsleep(periph, priority, "caplck", 0)) != 0)
+ if ((error = tsleep(periph, priority, "caplck", 0)) != 0) {
+ cam_periph_release(periph);
return error;
+ }
}
- if (cam_periph_acquire(periph) != CAM_REQ_CMP)
- return(ENXIO);
-
periph->flags |= CAM_PERIPH_LOCKED;
return 0;
}
@@ -891,13 +913,16 @@ cam_release_devq(struct cam_path *path, u_int32_t relsim_flags,
static void
camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
{
+ union ccb *saved_ccb;
cam_status status;
int frozen;
int sense;
struct scsi_start_stop_unit *scsi_cmd;
u_int32_t relsim_flags, timeout;
u_int32_t qfrozen_cnt;
+ int xpt_done_ccb;
+ xpt_done_ccb = FALSE;
status = done_ccb->ccb_h.status;
frozen = (status & CAM_DEV_QFRZN) != 0;
sense = (status & CAM_AUTOSNS_VALID) != 0;
@@ -905,6 +930,7 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
timeout = 0;
relsim_flags = 0;
+ saved_ccb = (union ccb *)done_ccb->ccb_h.saved_ccb_ptr;
/*
* Unfreeze the queue once if it is already frozen..
@@ -918,15 +944,19 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
}
switch (status) {
-
case CAM_REQ_CMP:
-
+ {
/*
* If we have successfully taken a device from the not
- * ready to ready state, re-scan the device and re-get the
- * inquiry information. Many devices (mostly disks) don't
- * properly report their inquiry information unless they
- * are spun up.
+ * ready to ready state, re-scan the device and re-get
+ * the inquiry information. Many devices (mostly disks)
+ * don't properly report their inquiry information unless
+ * they are spun up.
+ *
+ * If we manually retrieved sense into a CCB and got
+ * something other than "NO SENSE" send the updated CCB
+ * back to the client via xpt_done() to be processed via
+ * the error recovery code again.
*/
if (done_ccb->ccb_h.func_code == XPT_SCSI_IO) {
scsi_cmd = (struct scsi_start_stop_unit *)
@@ -935,15 +965,35 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
if (scsi_cmd->opcode == START_STOP_UNIT)
xpt_async(AC_INQ_CHANGED,
done_ccb->ccb_h.path, NULL);
+ if (scsi_cmd->opcode == REQUEST_SENSE) {
+ u_int sense_key;
+
+ sense_key = saved_ccb->csio.sense_data.flags;
+ sense_key &= SSD_KEY;
+ if (sense_key != SSD_KEY_NO_SENSE) {
+ saved_ccb->ccb_h.flags |=
+ CAM_AUTOSNS_VALID;
+ xpt_print_path(saved_ccb->ccb_h.path);
+ printf("Recovered Sense\n");
+#if 0
+ scsi_sense_print(&saved_ccb->csio);
+#endif
+ cam_error_print(saved_ccb, CAM_ESF_ALL,
+ CAM_EPF_ALL);
+ xpt_done_ccb = TRUE;
+ }
+ }
}
bcopy(done_ccb->ccb_h.saved_ccb_ptr, done_ccb,
sizeof(union ccb));
periph->flags &= ~CAM_PERIPH_RECOVERY_INPROG;
- xpt_action(done_ccb);
+ if (xpt_done_ccb == FALSE)
+ xpt_action(done_ccb);
break;
+ }
case CAM_SCSI_STATUS_ERROR:
scsi_cmd = (struct scsi_start_stop_unit *)
&done_ccb->csio.cdb_io.cdb_bytes;
@@ -982,7 +1032,7 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
xpt_action(done_ccb);
- } else if (done_ccb->ccb_h.retry_count > 0) {
+ } else if (done_ccb->ccb_h.retry_count > 1) {
/*
* In this case, the error recovery
* command failed, but we've got
@@ -1001,8 +1051,9 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
} else {
/*
- * Copy the original CCB back and
- * send it back to the caller.
+ * Perform the final retry with the original
+ * CCB so that final error processing is
+ * performed by the owner of the CCB.
*/
bcopy(done_ccb->ccb_h.saved_ccb_ptr,
done_ccb, sizeof(union ccb));
@@ -1039,6 +1090,13 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
}
/* decrement the retry count */
+ /*
+ * XXX This isn't appropriate in all cases. Restructure,
+ * so that the retry count is only decremented on an
+ * actual retry. Remeber that the orignal ccb had its
+ * retry count dropped before entering recovery, so
+ * doing it again is a bug.
+ */
if (done_ccb->ccb_h.retry_count > 0)
done_ccb->ccb_h.retry_count--;
@@ -1047,6 +1105,8 @@ camperiphdone(struct cam_periph *periph, union ccb *done_ccb)
/*openings*/0,
/*timeout*/timeout,
/*getcount_only*/0);
+ if (xpt_done_ccb == TRUE)
+ (*done_ccb->ccb_h.cbfcnp)(periph, done_ccb);
}
/*
@@ -1113,469 +1173,370 @@ cam_periph_freeze_after_event(struct cam_periph *periph,
}
-/*
- * Generic error handler. Peripheral drivers usually filter
- * out the errors that they handle in a unique mannor, then
- * call this function.
- */
-int
-cam_periph_error(union ccb *ccb, cam_flags camflags,
- u_int32_t sense_flags, union ccb *save_ccb)
+static int
+camperiphscsistatuserror(union ccb *ccb, cam_flags camflags,
+ u_int32_t sense_flags, union ccb *save_ccb,
+ int *openings, u_int32_t *relsim_flags,
+ u_int32_t *timeout)
{
- cam_status status;
- int frozen;
- int sense;
- int error;
- int openings;
- int retry;
- u_int32_t relsim_flags;
- u_int32_t timeout;
-
- status = ccb->ccb_h.status;
- frozen = (status & CAM_DEV_QFRZN) != 0;
- sense = (status & CAM_AUTOSNS_VALID) != 0;
- status &= CAM_STATUS_MASK;
- relsim_flags = 0;
+ int error;
- switch (status) {
- case CAM_REQ_CMP:
- /* decrement the number of retries */
- retry = ccb->ccb_h.retry_count > 0;
- if (retry)
- ccb->ccb_h.retry_count--;
+ switch (ccb->csio.scsi_status) {
+ case SCSI_STATUS_OK:
+ case SCSI_STATUS_COND_MET:
+ case SCSI_STATUS_INTERMED:
+ case SCSI_STATUS_INTERMED_COND_MET:
error = 0;
break;
- case CAM_AUTOSENSE_FAIL:
- case CAM_SCSI_STATUS_ERROR:
+ case SCSI_STATUS_CMD_TERMINATED:
+ case SCSI_STATUS_CHECK_COND:
+ error = camperiphscsisenseerror(ccb,
+ camflags,
+ sense_flags,
+ save_ccb,
+ openings,
+ relsim_flags,
+ timeout);
+ break;
+ case SCSI_STATUS_QUEUE_FULL:
+ {
+ /* no decrement */
+ struct ccb_getdevstats cgds;
- switch (ccb->csio.scsi_status) {
- case SCSI_STATUS_OK:
- case SCSI_STATUS_COND_MET:
- case SCSI_STATUS_INTERMED:
- case SCSI_STATUS_INTERMED_COND_MET:
- error = 0;
- break;
- case SCSI_STATUS_CMD_TERMINATED:
- case SCSI_STATUS_CHECK_COND:
- if (sense != 0) {
- struct scsi_sense_data *sense;
- int error_code, sense_key, asc, ascq;
- struct cam_periph *periph;
- scsi_sense_action err_action;
- struct ccb_getdev cgd;
-
- sense = &ccb->csio.sense_data;
- scsi_extract_sense(sense, &error_code,
- &sense_key, &asc, &ascq);
- periph = xpt_path_periph(ccb->ccb_h.path);
+ /*
+ * First off, find out what the current
+ * transaction counts are.
+ */
+ xpt_setup_ccb(&cgds.ccb_h,
+ ccb->ccb_h.path,
+ /*priority*/1);
+ cgds.ccb_h.func_code = XPT_GDEV_STATS;
+ xpt_action((union ccb *)&cgds);
+
+ /*
+ * If we were the only transaction active, treat
+ * the QUEUE FULL as if it were a BUSY condition.
+ */
+ if (cgds.dev_active != 0) {
+ int total_openings;
+ /*
+ * Reduce the number of openings to
+ * be 1 less than the amount it took
+ * to get a queue full bounded by the
+ * minimum allowed tag count for this
+ * device.
+ */
+ total_openings = cgds.dev_active + cgds.dev_openings;
+ *openings = cgds.dev_active;
+ if (*openings < cgds.mintags)
+ *openings = cgds.mintags;
+ if (*openings < total_openings)
+ *relsim_flags = RELSIM_ADJUST_OPENINGS;
+ else {
/*
- * Grab the inquiry data for this device.
+ * Some devices report queue full for
+ * temporary resource shortages. For
+ * this reason, we allow a minimum
+ * tag count to be entered via a
+ * quirk entry to prevent the queue
+ * count on these devices from falling
+ * to a pessimisticly low value. We
+ * still wait for the next successful
+ * completion, however, before queueing
+ * more transactions to the device.
*/
- xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path,
- /*priority*/ 1);
- cgd.ccb_h.func_code = XPT_GDEV_TYPE;
- xpt_action((union ccb *)&cgd);
+ *relsim_flags = RELSIM_RELEASE_AFTER_CMDCMPLT;
+ }
+ *timeout = 0;
+ error = ERESTART;
+ break;
+ }
+ /* FALLTHROUGH */
+ }
+ case SCSI_STATUS_BUSY:
+ /*
+ * Restart the queue after either another
+ * command completes or a 1 second timeout.
+ */
+ if (ccb->ccb_h.retry_count > 0) {
+ ccb->ccb_h.retry_count--;
+ error = ERESTART;
+ *relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
+ | RELSIM_RELEASE_AFTER_CMDCMPLT;
+ *timeout = 1000;
+ } else {
+ error = EIO;
+ }
+ break;
+ case SCSI_STATUS_RESERV_CONFLICT:
+ error = EIO;
+ break;
+ default:
+ error = EIO;
+ break;
+ }
+ return (error);
+}
- err_action = scsi_error_action(asc, ascq,
- &cgd.inq_data);
+static int
+camperiphscsisenseerror(union ccb *ccb, cam_flags camflags,
+ u_int32_t sense_flags, union ccb *save_ccb,
+ int *openings, u_int32_t *relsim_flags,
+ u_int32_t *timeout)
+{
+ struct cam_periph *periph;
+ int error;
- /*
- * Send a Test Unit Ready to the device.
- * If the 'many' flag is set, we send 120
- * test unit ready commands, one every half
- * second. Otherwise, we just send one TUR.
- * We only want to do this if the retry
- * count has not been exhausted.
- */
- if (((err_action & SS_MASK) == SS_TUR)
- && save_ccb != NULL
- && ccb->ccb_h.retry_count > 0) {
-
- /*
- * Since error recovery is already
- * in progress, don't attempt to
- * process this error. It is probably
- * related to the error that caused
- * the currently active error recovery
- * action. Also, we only have
- * space for one saved CCB, so if we
- * had two concurrent error recovery
- * actions, we would end up
- * over-writing one error recovery
- * CCB with another one.
- */
- if (periph->flags &
- CAM_PERIPH_RECOVERY_INPROG) {
- error = ERESTART;
- break;
- }
-
- periph->flags |=
- CAM_PERIPH_RECOVERY_INPROG;
-
- /* decrement the number of retries */
- if ((err_action &
- SSQ_DECREMENT_COUNT) != 0) {
- retry = 1;
- ccb->ccb_h.retry_count--;
- }
-
- bcopy(ccb, save_ccb, sizeof(*save_ccb));
-
- /*
- * We retry this one every half
- * second for a minute. If the
- * device hasn't become ready in a
- * minute's time, it's unlikely to
- * ever become ready. If the table
- * doesn't specify SSQ_MANY, we can
- * only try this once. Oh well.
- */
- if ((err_action & SSQ_MANY) != 0)
- scsi_test_unit_ready(&ccb->csio,
- /*retries*/120,
- camperiphdone,
- MSG_SIMPLE_Q_TAG,
- SSD_FULL_SIZE,
- /*timeout*/5000);
- else
- scsi_test_unit_ready(&ccb->csio,
- /*retries*/1,
- camperiphdone,
- MSG_SIMPLE_Q_TAG,
- SSD_FULL_SIZE,
- /*timeout*/5000);
-
- /* release the queue after .5 sec. */
- relsim_flags =
- RELSIM_RELEASE_AFTER_TIMEOUT;
- timeout = 500;
- /*
- * Drop the priority to 0 so that
- * we are the first to execute. Also
- * freeze the queue after this command
- * is sent so that we can restore the
- * old csio and have it queued in the
- * proper order before we let normal
- * transactions go to the drive.
- */
- ccb->ccb_h.pinfo.priority = 0;
- ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
-
- /*
- * Save a pointer to the original
- * CCB in the new CCB.
- */
- ccb->ccb_h.saved_ccb_ptr = save_ccb;
-
- error = ERESTART;
- }
- /*
- * Send a start unit command to the device,
- * and then retry the command. We only
- * want to do this if the retry count has
- * not been exhausted. If the user
- * specified 0 retries, then we follow
- * their request and do not retry.
- */
- else if (((err_action & SS_MASK) == SS_START)
- && save_ccb != NULL
- && ccb->ccb_h.retry_count > 0) {
- int le;
-
- /*
- * Only one error recovery action
- * at a time. See above.
- */
- if (periph->flags &
- CAM_PERIPH_RECOVERY_INPROG) {
- error = ERESTART;
- break;
- }
-
- periph->flags |=
- CAM_PERIPH_RECOVERY_INPROG;
-
- /* decrement the number of retries */
- retry = 1;
- ccb->ccb_h.retry_count--;
-
- /*
- * Check for removable media and
- * set load/eject flag
- * appropriately.
- */
- if (SID_IS_REMOVABLE(&cgd.inq_data))
- le = TRUE;
- else
- le = FALSE;
-
- /*
- * Attempt to start the drive up.
- *
- * Save the current ccb so it can
- * be restored and retried once the
- * drive is started up.
- */
- bcopy(ccb, save_ccb, sizeof(*save_ccb));
-
- scsi_start_stop(&ccb->csio,
- /*retries*/1,
- camperiphdone,
- MSG_SIMPLE_Q_TAG,
- /*start*/TRUE,
- /*load/eject*/le,
- /*immediate*/FALSE,
- SSD_FULL_SIZE,
- /*timeout*/50000);
- /*
- * Drop the priority to 0 so that
- * we are the first to execute. Also
- * freeze the queue after this command
- * is sent so that we can restore the
- * old csio and have it queued in the
- * proper order before we let normal
- * transactions go to the drive.
- */
- ccb->ccb_h.pinfo.priority = 0;
- ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
-
- /*
- * Save a pointer to the original
- * CCB in the new CCB.
- */
- ccb->ccb_h.saved_ccb_ptr = save_ccb;
-
- error = ERESTART;
- } else if ((sense_flags & SF_RETRY_UA) != 0) {
- /*
- * XXX KDM this is a *horrible*
- * hack.
- */
- error = scsi_interpret_sense(ccb,
- sense_flags,
- &relsim_flags,
- &openings,
- &timeout,
- err_action);
- }
+ periph = xpt_path_periph(ccb->ccb_h.path);
+ if (periph->flags & CAM_PERIPH_RECOVERY_INPROG) {
- /*
- * Theoretically, this code should send a
- * test unit ready to the given device, and
- * if it returns and error, send a start
- * unit command. Since we don't yet have
- * the capability to do two-command error
- * recovery, just send a start unit.
- * XXX KDM fix this!
- */
- else if (((err_action & SS_MASK) == SS_TURSTART)
- && save_ccb != NULL
- && ccb->ccb_h.retry_count > 0) {
- int le;
-
- /*
- * Only one error recovery action
- * at a time. See above.
- */
- if (periph->flags &
- CAM_PERIPH_RECOVERY_INPROG) {
- error = ERESTART;
- break;
- }
-
- periph->flags |=
- CAM_PERIPH_RECOVERY_INPROG;
-
- /* decrement the number of retries */
- retry = 1;
- ccb->ccb_h.retry_count--;
-
- /*
- * Check for removable media and
- * set load/eject flag
- * appropriately.
- */
- if (SID_IS_REMOVABLE(&cgd.inq_data))
- le = TRUE;
- else
- le = FALSE;
-
- /*
- * Attempt to start the drive up.
- *
- * Save the current ccb so it can
- * be restored and retried once the
- * drive is started up.
- */
- bcopy(ccb, save_ccb, sizeof(*save_ccb));
-
- scsi_start_stop(&ccb->csio,
- /*retries*/1,
- camperiphdone,
- MSG_SIMPLE_Q_TAG,
- /*start*/TRUE,
- /*load/eject*/le,
- /*immediate*/FALSE,
- SSD_FULL_SIZE,
- /*timeout*/50000);
-
- /* release the queue after .5 sec. */
- relsim_flags =
- RELSIM_RELEASE_AFTER_TIMEOUT;
- timeout = 500;
- /*
- * Drop the priority to 0 so that
- * we are the first to execute. Also
- * freeze the queue after this command
- * is sent so that we can restore the
- * old csio and have it queued in the
- * proper order before we let normal
- * transactions go to the drive.
- */
- ccb->ccb_h.pinfo.priority = 0;
- ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
-
- /*
- * Save a pointer to the original
- * CCB in the new CCB.
- */
- ccb->ccb_h.saved_ccb_ptr = save_ccb;
-
- error = ERESTART;
- } else {
- error = scsi_interpret_sense(ccb,
- sense_flags,
- &relsim_flags,
- &openings,
- &timeout,
- err_action);
- }
- } else if (ccb->csio.scsi_status ==
- SCSI_STATUS_CHECK_COND
- && status != CAM_AUTOSENSE_FAIL) {
- /* no point in decrementing the retry count */
- panic("cam_periph_error: scsi status of "
- "CHECK COND returned but no sense "
- "information is availible. "
- "Controller should have returned "
- "CAM_AUTOSENSE_FAILED");
- /* NOTREACHED */
- error = EIO;
- } else if (ccb->ccb_h.retry_count == 0) {
- /*
- * XXX KDM shouldn't there be a better
- * argument to return??
- */
- error = EIO;
- } else {
- /* decrement the number of retries */
- retry = ccb->ccb_h.retry_count > 0;
- if (retry)
- ccb->ccb_h.retry_count--;
- /*
- * If it was aborted with no
- * clue as to the reason, just
- * retry it again.
- */
- error = ERESTART;
+ /*
+ * If error recovery is already in progress, don't attempt
+ * to process this error, but requeue it unconditionally
+ * and attempt to process it once error recovery has
+ * completed. This failed command is probably related to
+ * the error that caused the currently active error recovery
+ * action so our current recovery efforts should also
+ * address this command. Be aware that the error recovery
+ * code assumes that only one recovery action is in progress
+ * on a particular peripheral instance at any given time
+ * (e.g. only one saved CCB for error recovery) so it is
+ * imperitive that we don't violate this assumption.
+ */
+ error = ERESTART;
+ } else {
+ scsi_sense_action err_action;
+ struct ccb_getdev cgd;
+ const char *action_string;
+ union ccb* print_ccb;
+
+ /* A description of the error recovery action performed */
+ action_string = NULL;
+
+ /*
+ * The location of the orignal ccb
+ * for sense printing purposes.
+ */
+ print_ccb = ccb;
+
+ /*
+ * Grab the inquiry data for this device.
+ */
+ xpt_setup_ccb(&cgd.ccb_h, ccb->ccb_h.path, /*priority*/ 1);
+ cgd.ccb_h.func_code = XPT_GDEV_TYPE;
+ xpt_action((union ccb *)&cgd);
+
+ if ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)
+ err_action = scsi_error_action(&ccb->csio,
+ &cgd.inq_data,
+ sense_flags);
+ else if ((ccb->ccb_h.flags & CAM_DIS_AUTOSENSE) == 0)
+ err_action = SS_REQSENSE;
+ else
+ err_action = SS_RETRY|SSQ_DECREMENT_COUNT|EIO;
+
+ error = err_action & SS_ERRMASK;
+
+ /*
+ * If the recovery action will consume a retry,
+ * make sure we actually have retries available.
+ */
+ if ((err_action & SSQ_DECREMENT_COUNT) != 0) {
+ if (ccb->ccb_h.retry_count > 0)
+ ccb->ccb_h.retry_count--;
+ else {
+ action_string = "Retries Exhausted";
+ goto sense_error_done;
+ }
+ }
+
+ if ((err_action & SS_MASK) >= SS_START) {
+ /*
+ * Do common portions of commands that
+ * use recovery CCBs.
+ */
+ if (save_ccb == NULL) {
+ action_string = "No recovery CCB supplied";
+ goto sense_error_done;
}
+ bcopy(ccb, save_ccb, sizeof(*save_ccb));
+ print_ccb = save_ccb;
+ periph->flags |= CAM_PERIPH_RECOVERY_INPROG;
+ }
+
+ switch (err_action & SS_MASK) {
+ case SS_NOP:
+ case SS_RETRY:
+ action_string = "Retrying Command";
+ error = ERESTART;
break;
- case SCSI_STATUS_QUEUE_FULL:
+ case SS_FAIL:
+ action_string = "Unretryable error";
+ break;
+ case SS_START:
{
- /* no decrement */
- struct ccb_getdevstats cgds;
+ int le;
/*
- * First off, find out what the current
- * transaction counts are.
+ * Send a start unit command to the device, and
+ * then retry the command.
*/
- xpt_setup_ccb(&cgds.ccb_h,
- ccb->ccb_h.path,
- /*priority*/1);
- cgds.ccb_h.func_code = XPT_GDEV_STATS;
- xpt_action((union ccb *)&cgds);
+ action_string = "Attempting to Start Unit";
/*
- * If we were the only transaction active, treat
- * the QUEUE FULL as if it were a BUSY condition.
+ * Check for removable media and set
+ * load/eject flag appropriately.
*/
- if (cgds.dev_active != 0) {
- int total_openings;
-
- /*
- * Reduce the number of openings to
- * be 1 less than the amount it took
- * to get a queue full bounded by the
- * minimum allowed tag count for this
- * device.
- */
- total_openings =
- cgds.dev_active+cgds.dev_openings;
- openings = cgds.dev_active;
- if (openings < cgds.mintags)
- openings = cgds.mintags;
- if (openings < total_openings)
- relsim_flags = RELSIM_ADJUST_OPENINGS;
- else {
- /*
- * Some devices report queue full for
- * temporary resource shortages. For
- * this reason, we allow a minimum
- * tag count to be entered via a
- * quirk entry to prevent the queue
- * count on these devices from falling
- * to a pessimisticly low value. We
- * still wait for the next successful
- * completion, however, before queueing
- * more transactions to the device.
- */
- relsim_flags =
- RELSIM_RELEASE_AFTER_CMDCMPLT;
- }
- timeout = 0;
- error = ERESTART;
- break;
- }
- /* FALLTHROUGH */
+ if (SID_IS_REMOVABLE(&cgd.inq_data))
+ le = TRUE;
+ else
+ le = FALSE;
+
+ scsi_start_stop(&ccb->csio,
+ /*retries*/1,
+ camperiphdone,
+ MSG_SIMPLE_Q_TAG,
+ /*start*/TRUE,
+ /*load/eject*/le,
+ /*immediate*/FALSE,
+ SSD_FULL_SIZE,
+ /*timeout*/50000);
+ break;
}
- case SCSI_STATUS_BUSY:
+ case SS_TUR:
+ {
/*
- * Restart the queue after either another
- * command completes or a 1 second timeout.
- * If we have any retries left, that is.
+ * Send a Test Unit Ready to the device.
+ * If the 'many' flag is set, we send 120
+ * test unit ready commands, one every half
+ * second. Otherwise, we just send one TUR.
+ * We only want to do this if the retry
+ * count has not been exhausted.
*/
- retry = ccb->ccb_h.retry_count > 0;
- if (retry) {
- ccb->ccb_h.retry_count--;
- error = ERESTART;
- relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT
- | RELSIM_RELEASE_AFTER_CMDCMPLT;
- timeout = 1000;
+ int retries;
+
+ if ((err_action & SSQ_MANY) != 0) {
+ action_string = "Polling device for readiness";
+ retries = 120;
} else {
- error = EIO;
+ action_string = "Testing device for readiness";
+ retries = 1;
}
+ scsi_test_unit_ready(&ccb->csio,
+ retries,
+ camperiphdone,
+ MSG_SIMPLE_Q_TAG,
+ SSD_FULL_SIZE,
+ /*timeout*/5000);
+
+ /*
+ * Accomplish our 500ms delay by deferring
+ * the release of our device queue appropriately.
+ */
+ *relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
+ *timeout = 500;
break;
- case SCSI_STATUS_RESERV_CONFLICT:
- error = EIO;
+ }
+ case SS_REQSENSE:
+ {
+ /*
+ * Send a Request Sense to the device. We
+ * assume that we are in a contingent allegiance
+ * condition so we do not tag this request.
+ */
+ scsi_request_sense(&ccb->csio, /*retries*/1,
+ camperiphdone,
+ &save_ccb->csio.sense_data,
+ sizeof(save_ccb->csio.sense_data),
+ CAM_TAG_ACTION_NONE,
+ /*sense_len*/SSD_FULL_SIZE,
+ /*timeout*/5000);
break;
+ }
default:
- error = EIO;
- break;
+ panic("Unhandled error action %x\n", err_action);
+ }
+
+ if ((err_action & SS_MASK) >= SS_START) {
+ /*
+ * Drop the priority to 0 so that the recovery
+ * CCB is the first to execute. Freeze the queue
+ * after this command is sent so that we can
+ * restore the old csio and have it queued in
+ * the proper order before we release normal
+ * transactions to the device.
+ */
+ ccb->ccb_h.pinfo.priority = 0;
+ ccb->ccb_h.flags |= CAM_DEV_QFREEZE;
+ ccb->ccb_h.saved_ccb_ptr = save_ccb;
+ error = ERESTART;
}
+
+sense_error_done:
+ if ((err_action & SSQ_PRINT_SENSE) != 0
+ && (ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0) {
+#if 0
+ scsi_sense_print(&print_ccb->csio);
+#endif
+ cam_error_print(print_ccb, CAM_ESF_ALL, CAM_EPF_ALL);
+ xpt_print_path(ccb->ccb_h.path);
+ printf("%s\n", action_string);
+ }
+ }
+ return (error);
+}
+
+/*
+ * Generic error handler. Peripheral drivers usually filter
+ * out the errors that they handle in a unique mannor, then
+ * call this function.
+ */
+int
+cam_periph_error(union ccb *ccb, cam_flags camflags,
+ u_int32_t sense_flags, union ccb *save_ccb)
+{
+ const char *action_string;
+ cam_status status;
+ int frozen;
+ int error;
+ int openings;
+ u_int32_t relsim_flags;
+ u_int32_t timeout;
+
+ action_string = NULL;
+ status = ccb->ccb_h.status;
+ frozen = (status & CAM_DEV_QFRZN) != 0;
+ status &= CAM_STATUS_MASK;
+ relsim_flags = 0;
+
+ switch (status) {
+ case CAM_REQ_CMP:
+ error = 0;
break;
+ case CAM_SCSI_STATUS_ERROR:
+ error = camperiphscsistatuserror(ccb,
+ camflags,
+ sense_flags,
+ save_ccb,
+ &openings,
+ &relsim_flags,
+ &timeout);
+ break;
+ case CAM_AUTOSENSE_FAIL:
+ xpt_print_path(ccb->ccb_h.path);
+ printf("AutoSense Failed\n");
case CAM_REQ_CMP_ERR:
case CAM_CMD_TIMEOUT:
case CAM_UNEXP_BUSFREE:
case CAM_UNCOR_PARITY:
case CAM_DATA_RUN_ERR:
/* decrement the number of retries */
- retry = ccb->ccb_h.retry_count > 0;
- if (retry) {
+ if (ccb->ccb_h.retry_count > 0) {
ccb->ccb_h.retry_count--;
error = ERESTART;
} else {
+ action_string = "Retries Exausted";
error = EIO;
}
break;
@@ -1587,46 +1548,37 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
break;
case CAM_SEL_TIMEOUT:
{
- /*
- * XXX
- * A single selection timeout should not be enough
- * to invalidate a device. We should retry for multiple
- * seconds assuming this isn't a probe. We'll probably
- * need a special flag for that.
- */
-#if 0
struct cam_path *newpath;
+ if ((camflags & CAM_RETRY_SELTO) != 0) {
+ if (ccb->ccb_h.retry_count > 0) {
+
+ ccb->ccb_h.retry_count--;
+ error = ERESTART;
+
+ /*
+ * Wait a second to give the device
+ * time to recover before we try again.
+ */
+ relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
+ timeout = 1000;
+ break;
+ }
+ }
+ error = ENXIO;
/* Should we do more if we can't create the path?? */
if (xpt_create_path(&newpath, xpt_path_periph(ccb->ccb_h.path),
xpt_path_path_id(ccb->ccb_h.path),
xpt_path_target_id(ccb->ccb_h.path),
CAM_LUN_WILDCARD) != CAM_REQ_CMP)
break;
+
/*
* Let peripheral drivers know that this device has gone
* away.
*/
xpt_async(AC_LOST_DEVICE, newpath, NULL);
xpt_free_path(newpath);
-#endif
- if ((sense_flags & SF_RETRY_SELTO) != 0) {
- retry = ccb->ccb_h.retry_count > 0;
- if (retry) {
- ccb->ccb_h.retry_count--;
- error = ERESTART;
- /*
- * Wait half a second to give the device
- * time to recover before we try again.
- */
- relsim_flags = RELSIM_RELEASE_AFTER_TIMEOUT;
- timeout = 500;
- } else {
- error = ENXIO;
- }
- } else {
- error = ENXIO;
- }
break;
}
case CAM_REQ_INVALID:
@@ -1634,13 +1586,22 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
case CAM_DEV_NOT_THERE:
case CAM_NO_HBA:
case CAM_PROVIDE_FAIL:
- case CAM_REQ_TOO_BIG:
+ case CAM_REQ_TOO_BIG:
error = EINVAL;
break;
case CAM_SCSI_BUS_RESET:
- case CAM_BDR_SENT:
+ case CAM_BDR_SENT:
+ /*
+ * Commands that repeatedly timeout and cause these
+ * kinds of error recovery actions, should return
+ * CAM_CMD_TIMEOUT, which allows us to safely assume
+ * that this command was an innocent bystander to
+ * these events and should be unconditionally
+ * retried.
+ */
+ /* FALLTHROUGH */
case CAM_REQUEUE_REQ:
- /* Unconditional requeue, dammit */
+ /* Unconditional requeue */
error = ERESTART;
break;
case CAM_RESRC_UNAVAIL:
@@ -1648,13 +1609,12 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
/* timeout??? */
default:
/* decrement the number of retries */
- retry = ccb->ccb_h.retry_count > 0;
- if (retry) {
+ if (ccb->ccb_h.retry_count > 0) {
ccb->ccb_h.retry_count--;
error = ERESTART;
} else {
- /* Check the sense codes */
error = EIO;
+ action_string = "Retries Exhausted";
}
break;
}
@@ -1664,18 +1624,30 @@ cam_periph_error(union ccb *ccb, cam_flags camflags,
if (frozen != 0)
ccb->ccb_h.status &= ~CAM_DEV_QFRZN;
- if (error == ERESTART)
+ if (error == ERESTART) {
+ action_string = "Retrying Command";
xpt_action(ccb);
+ }
- if (frozen != 0) {
+ if (frozen != 0)
cam_release_devq(ccb->ccb_h.path,
relsim_flags,
openings,
timeout,
/*getcount_only*/0);
- }
}
+ if (error != 0 && bootverbose) {
+
+ if (action_string == NULL)
+ action_string = "Unretryable Error";
+ if (error != ERESTART) {
+ xpt_print_path(ccb->ccb_h.path);
+ printf("error %d\n", error);
+ }
+ xpt_print_path(ccb->ccb_h.path);
+ printf("%s\n", action_string);
+ }
return (error);
}