Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c  398
1 file changed, 298 insertions(+), 100 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
index 5fb44936b127..1900fd21add6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -206,17 +206,21 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
/*
- * If a split block contains more than this many segments, consider it too
- * computationally expensive to check all (2^num_segments) possible
- * combinations. Instead, try at most 2^_segments_max randomly-selected
- * combinations.
- *
- * This is reasonable if only a few segment copies are damaged and the
- * majority of segment copies are good. This allows all the segment copies to
- * participate fairly in the reconstruction and prevents the repeated use of
- * one bad copy.
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed. This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
+ */
+int zfs_reconstruct_indirect_combinations_max = 256;
+
+
+/*
+ * Enable to simulate damaged segments and validate reconstruction.
+ * Used by ztest.
*/
-int zfs_reconstruct_indirect_segments_max = 10;
+unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
/*
* The indirect_child_t represents the vdev that we will read from, when we
@@ -228,6 +232,13 @@ int zfs_reconstruct_indirect_segments_max = 10;
typedef struct indirect_child {
abd_t *ic_data;
vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is NULL when the ic_data contents are unique; when it
+ * is determined to be a duplicate it references the primary child.
+ */
+ struct indirect_child *ic_duplicate;
+ list_node_t ic_node; /* node on is_unique_child */
} indirect_child_t;
/*
@@ -249,12 +260,14 @@ typedef struct indirect_split {
uint64_t is_target_offset; /* offset on is_vdev */
uint64_t is_size;
int is_children; /* number of entries in is_child[] */
+ int is_unique_children; /* number of entries in is_unique_child */
+ list_t is_unique_child;
/*
* is_good_child is the child that we are currently using to
* attempt reconstruction.
*/
- int is_good_child;
+ indirect_child_t *is_good_child;
indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
@@ -266,6 +279,9 @@ typedef struct indirect_split {
typedef struct indirect_vsd {
boolean_t iv_split_block;
boolean_t iv_reconstruct;
+ uint64_t iv_unique_combinations;
+ uint64_t iv_attempts;
+ uint64_t iv_attempts_max;
list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
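
The iv_unique_combinations and iv_attempts_max fields drive the later choice between exhaustive and random search. As a rough illustration only, a stand-alone sketch of that bookkeeping might look like the following; the example values and the COMBINATIONS_MAX name are invented, standing in for zfs_reconstruct_indirect_combinations_max:

/*
 * Sketch only: compute the number of unique combinations for a set of
 * splits and pick a search strategy, mirroring how iv_unique_combinations
 * is compared against iv_attempts_max in the reconstruction code below.
 */
#include <stdint.h>
#include <stdio.h>

#define	COMBINATIONS_MAX	256

int
main(void)
{
	/* Unique copies surviving per split, e.g. four splits on mirrors. */
	int unique_children[] = { 2, 3, 2, 3 };
	int nsplits = sizeof (unique_children) / sizeof (unique_children[0]);
	uint64_t combinations = 1;

	for (int i = 0; i < nsplits; i++)
		combinations *= unique_children[i];

	if (combinations <= COMBINATIONS_MAX)
		printf("%ju combinations: enumerate all\n",
		    (uintmax_t)combinations);
	else
		printf("%ju combinations: sample %d at random\n",
		    (uintmax_t)combinations, COMBINATIONS_MAX);
	return (0);
}
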
@@ -283,6 +299,13 @@ vdev_indirect_map_free(zio_t *zio)
abd_free(ic->ic_data);
}
list_remove(&iv->iv_splits, is);
+
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ list_destroy(&is->is_unique_child);
+
kmem_free(is,
offsetof(indirect_split_t, is_child[is->is_children]));
}
@@ -1181,6 +1204,8 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
is->is_split_offset = split_offset;
is->is_target_offset = offset;
is->is_vdev = vd;
+ list_create(&is->is_unique_child, sizeof (indirect_child_t),
+ offsetof(indirect_child_t, ic_node));
/*
* Note that we only consider multiple copies of the data for
@@ -1191,6 +1216,7 @@ vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
if (vd->vdev_ops == &vdev_mirror_ops) {
for (int i = 0; i < n; i++) {
is->is_child[i].ic_vdev = vd->vdev_child[i];
+ list_link_init(&is->is_child[i].ic_node);
}
} else {
is->is_child[0].ic_vdev = vd;
@@ -1243,6 +1269,7 @@ vdev_indirect_read_all(zio_t *zio)
ic->ic_data = abd_alloc_sametype(zio->io_abd,
is->is_size);
+ ic->ic_duplicate = NULL;
zio_nowait(zio_vdev_child_io(zio, NULL,
ic->ic_vdev, is->is_target_offset, ic->ic_data,
@@ -1364,7 +1391,7 @@ vdev_indirect_checksum_error(zio_t *zio,
zio_bad_cksum_t zbc = { 0 };
void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
- abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+ abd_t *good_abd = is->is_good_child->ic_data;
void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
@@ -1397,21 +1424,18 @@ vdev_indirect_repair(zio_t *zio)
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
- indirect_child_t *good_child = &is->is_child[is->is_good_child];
-
for (int c = 0; c < is->is_children; c++) {
indirect_child_t *ic = &is->is_child[c];
- if (ic == good_child)
+ if (ic == is->is_good_child)
continue;
if (ic->ic_data == NULL)
continue;
- if (abd_cmp(good_child->ic_data, ic->ic_data,
- is->is_size) == 0)
+ if (ic->ic_duplicate == is->is_good_child)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
ic->ic_vdev, is->is_target_offset,
- good_child->ic_data, is->is_size,
+ is->is_good_child->ic_data, is->is_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
NULL, NULL));
@@ -1454,20 +1478,193 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
}
/*
+ * Copy data from all the splits to a main zio, then validate the checksum.
+ * If the checksum is successfully validated, return success.
+ */
+static int
+vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ ASSERT3P(is->is_good_child->ic_data, !=, NULL);
+ ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
+
+ abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ return (zio_checksum_error(zio, &zbc));
+}
+
+/*
+ * There are relatively few possible combinations, making it feasible to
+ * deterministically check them all. We do this by setting is_good_child
+ * to the next unique split version. If we reach the end of the list we
+ * "carry over" to the next split (like counting in base
+ * is_unique_children, but each digit can have a different base).
+ */
+static int
+vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
+{
+ boolean_t more = B_TRUE;
+
+ iv->iv_attempts = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ is->is_good_child = list_head(&is->is_unique_child);
+
+ while (more == B_TRUE) {
+ iv->iv_attempts++;
+ more = B_FALSE;
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child = list_next(&is->is_unique_child,
+ is->is_good_child);
+ if (is->is_good_child != NULL) {
+ more = B_TRUE;
+ break;
+ }
+
+ is->is_good_child = list_head(&is->is_unique_child);
+ }
+ }
+
+ ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
+
+ return (SET_ERROR(ECKSUM));
+}
+
+/*
+ * There are too many combinations to try all of them in a reasonable amount
+ * of time. So try a fixed number of random combinations from the unique
+ * split versions, after which we'll consider the block unrecoverable.
+ */
+static int
+vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
+{
+ iv->iv_attempts = 0;
+
+ while (iv->iv_attempts < iv->iv_attempts_max) {
+ iv->iv_attempts++;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic = list_head(&is->is_unique_child);
+ int children = is->is_unique_children;
+
+ for (int i = spa_get_random(children); i > 0; i--)
+ ic = list_next(&is->is_unique_child, ic);
+
+ ASSERT3P(ic, !=, NULL);
+ is->is_good_child = ic;
+ }
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(ECKSUM));
+}
+
+/*
+ * This is a validation function for reconstruction. It randomly selects
+ * a good combination, if one can be found, and then it intentionally
+ * damages all other segment copies by zeroing them. This forces the
+ * reconstruction algorithm to locate the one remaining known good copy.
+ */
+static int
+vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
+{
+ /* Presume all the copies are unique for initial selection. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+ if (ic->ic_data != NULL) {
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic);
+ }
+ }
+ }
+
+ /*
+ * Set each is_good_child to a randomly-selected child which
+ * is known to contain validated data.
+ */
+ int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+ if (error)
+ goto out;
+
+ /*
+ * Damage all but the known good copy by zeroing them. This will
+ * result in at most two unique copies per indirect_split_t. Both
+ * may need to be checked in order to reconstruct the block. Set
+ * iv->iv_attempts_max such that all unique combinations will be
+ * enumerated, but limit the damage to at most 16 indirect splits.
+ */
+ iv->iv_attempts_max = 1;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+
+ abd_zero(ic->ic_data, ic->ic_data->abd_size);
+ }
+
+ iv->iv_attempts_max *= 2;
+ if (iv->iv_attempts_max > (1ULL << 16)) {
+ iv->iv_attempts_max = UINT64_MAX;
+ break;
+ }
+ }
+
+out:
+ /* Empty the unique children lists so they can be reconstructed. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ is->is_unique_children = 0;
+ }
+
+ return (error);
+}
+
+/*
* This function is called when we have read all copies of the data and need
* to try to find a combination of copies that gives us the right checksum.
*
* If we pointed to any mirror vdevs, this effectively does the job of the
* mirror. The mirror vdev code can't do its own job because we don't know
- * the checksum of each split segment individually. We have to try every
- * combination of copies of split segments, until we find one that checksums
- * correctly. (Or until we have tried all combinations, or have tried
- * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
- * set io_error to ECKSUM to propagate the error up to the user.)
+ * the checksum of each split segment individually.
*
- * For example, if we have 3 segments in the split,
- * and each points to a 2-way mirror, we will have the following pieces of
- * data:
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * identified and later skipped during reconstruction. This optimization
+ * reduces the search space and ensures that of the remaining combinations
+ * at most one is correct.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
*
* | mirror child
* split | [0] [1]
@@ -1494,10 +1691,10 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
* data_A_1 data_B_1 data_C_1
*
* Note that the split segments may be on the same or different top-level
- * vdevs. In either case, we try lots of combinations (see
- * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
- * small silent errors on all of its children, we can still reconstruct the
- * correct data, as long as those errors are at sufficiently-separated
+ * vdevs. In either case, we may need to try lots of combinations (see
+ * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
+ * has small silent errors on all of its children, we can still reconstruct
+ * the correct data, as long as those errors are at sufficiently-separated
* offsets (specifically, separated by the largest block size - default of
* 128KB, but up to 16MB).
*/
@@ -1505,90 +1702,91 @@ static void
vdev_indirect_reconstruct_io_done(zio_t *zio)
{
indirect_vsd_t *iv = zio->io_vsd;
- uint64_t attempts = 0;
- uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
- int segments = 0;
+ boolean_t known_good = B_FALSE;
+ int error;
+
+ iv->iv_unique_combinations = 1;
+ iv->iv_attempts_max = UINT64_MAX;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
+
+ /*
+ * If nonzero, every 1/x blocks will be damaged, in order to validate
+ * reconstruction when there are split segments with damaged copies.
+ * known_good will be TRUE when reconstruction is known to be possible.
+ */
+ if (zfs_reconstruct_indirect_damage_fraction != 0 &&
+ spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
+ known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
+ /*
+ * Determine the unique children for a split segment and add them
+ * to the is_unique_child list. By restricting reconstruction
+ * to these children, only unique combinations will be considered.
+ * This can vastly reduce the search space when there are a large
+ * number of indirect splits.
+ */
for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is))
- segments++;
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
- for (;;) {
- /* copy data from splits to main zio */
- int ret;
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic_i = &is->is_child[i];
- /*
- * If this child failed, its ic_data will be NULL.
- * Skip this combination.
- */
- if (is->is_child[is->is_good_child].ic_data == NULL) {
- ret = EIO;
- goto next;
- }
+ if (ic_i->ic_data == NULL ||
+ ic_i->ic_duplicate != NULL)
+ continue;
- abd_copy_off(zio->io_abd,
- is->is_child[is->is_good_child].ic_data,
- is->is_split_offset, 0, is->is_size);
- }
+ for (int j = i + 1; j < is->is_children; j++) {
+ indirect_child_t *ic_j = &is->is_child[j];
- /* See if this checksum matches. */
- zio_bad_cksum_t zbc;
- ret = zio_checksum_error(zio, &zbc);
- if (ret == 0) {
- /* Found a matching checksum. Issue repair i/os. */
- vdev_indirect_repair(zio);
- zio_checksum_verified(zio);
- return;
- }
+ if (ic_j->ic_data == NULL ||
+ ic_j->ic_duplicate != NULL)
+ continue;
- /*
- * Checksum failed; try a different combination of split
- * children.
- */
- boolean_t more;
-next:
- more = B_FALSE;
- if (segments <= zfs_reconstruct_indirect_segments_max) {
- /*
- * There are relatively few segments, so
- * deterministically check all combinations. We do
- * this by by adding one to the first split's
- * good_child. If it overflows, then "carry over" to
- * the next split (like counting in base is_children,
- * but each digit can have a different base).
- */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_good_child++;
- if (is->is_good_child < is->is_children) {
- more = B_TRUE;
- break;
+ if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
+ is->is_size) == 0) {
+ ic_j->ic_duplicate = ic_i;
}
- is->is_good_child = 0;
}
- } else if (++attempts < attempts_max) {
- /*
- * There are too many combinations to try all of them
- * in a reasonable amount of time, so try a fixed
- * number of random combinations, after which we'll
- * consider the block unrecoverable.
- */
- for (indirect_split_t *is = list_head(&iv->iv_splits);
- is != NULL; is = list_next(&iv->iv_splits, is)) {
- is->is_good_child =
- spa_get_random(is->is_children);
- }
- more = B_TRUE;
+
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic_i);
}
- if (!more) {
- /* All combinations failed. */
- zio->io_error = ret;
+
+ /* Reconstruction is impossible, no valid children */
+ EQUIV(list_is_empty(&is->is_unique_child),
+ is->is_unique_children == 0);
+ if (list_is_empty(&is->is_unique_child)) {
+ zio->io_error = EIO;
vdev_indirect_all_checksum_errors(zio);
zio_checksum_verified(zio);
return;
}
+
+ iv->iv_unique_combinations *= is->is_unique_children;
+ }
+
+ if (iv->iv_unique_combinations <= iv->iv_attempts_max)
+ error = vdev_indirect_splits_enumerate_all(iv, zio);
+ else
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+
+ if (error != 0) {
+ /* All attempted combinations failed. */
+ ASSERT3B(known_good, ==, B_FALSE);
+ zio->io_error = error;
+ vdev_indirect_all_checksum_errors(zio);
+ } else {
+ /*
+ * The checksum has been successfully validated. Issue
+ * repair I/Os to any copies of splits which don't match
+ * the validated version.
+ */
+ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
}
}
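
As a closing illustration, the duplicate detection done above with abd_cmp() amounts to collapsing byte-identical copies of a split onto one primary copy, so only distinct versions enter the search. A small user-space sketch of the same idea, with memcmp() standing in for abd_cmp() and invented example data:

/*
 * Sketch only: copies of one split whose contents compare equal are
 * recorded as duplicates of the first (primary) copy, mirroring how
 * ic_duplicate is set before building the is_unique_child list.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Three copies of one 4-byte split segment; two are identical. */
	const char *copies[] = { "good", "good", "bad!" };
	int ncopies = 3, size = 4;
	int duplicate_of[3] = { -1, -1, -1 };
	int unique = 0;

	for (int i = 0; i < ncopies; i++) {
		if (duplicate_of[i] != -1)
			continue;
		unique++;
		for (int j = i + 1; j < ncopies; j++) {
			if (duplicate_of[j] == -1 &&
			    memcmp(copies[i], copies[j], size) == 0)
				duplicate_of[j] = i;
		}
	}

	printf("%d unique of %d copies\n", unique, ncopies);
	return (0);
}
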