src - FreeBSD source tree

diff options


context:
space:
mode:

author	Alan Somers <asomers@FreeBSD.org>	2023-07-12 20:46:27 +0000
committer	Alan Somers <asomers@FreeBSD.org>	2023-11-29 14:51:11 +0000
commit	d565784a7ebaa59e26febdcfd4a60329786ea5f5 (patch)
tree	39f52825a2466ede269ddb3da85a2436f7eac015
parent	272a406042608da9bc3e67e41c6b7fc31d4166b8 (diff)
download	src-d565784a7ebaa59e26febdcfd4a60329786ea5f5.tar.gz src-d565784a7ebaa59e26febdcfd4a60329786ea5f5.zip

zfsd: fault disks that generate too many I/O delay events

If ZFS reports that a disk had at least 8 I/O operations over 60s that were each delayed by at least 30s (implying a queue depth > 4 or I/O aggregation, obviously), fault that disk. Disks that respond this slowly can degrade the entire system's performance. MFC after: 2 weeks Sponsored by: Axcient Reviewed by: delphij Differential Revision: https://reviews.freebsd.org/D42825

Diffstat

-rw-r--r--

cddl/usr.sbin/zfsd/case_file.cc

-rw-r--r--

cddl/usr.sbin/zfsd/case_file.h

-rw-r--r--

cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc

-rw-r--r--

tests/sys/cddl/zfs/tests/zfsd/Makefile

-rw-r--r--

tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh

-rwxr-xr-x

tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh

6 files changed, 220 insertions, 4 deletions

diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 3728913b646b..39f89fbbf7c8 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc

@@ -470,7 +470,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)

consumed = true;

}

else if (event.Value("class") == "ereport.fs.zfs.io" ||

- event.Value("class") == "ereport.fs.zfs.checksum") {

+ event.Value("class") == "ereport.fs.zfs.checksum" ||

+ event.Value("class") == "ereport.fs.zfs.delay") {

m_tentativeEvents.push_front(event.DeepCopy());

RegisterCallout(event);

@@ -1171,6 +1172,13 @@ IsIOEvent(const Event* const event)

return ("ereport.fs.zfs.io" == event->Value("type"));

}

+/* Does the argument event refer to an IO delay? */

+static bool

+IsDelayEvent(const Event* const event)

+ return ("ereport.fs.zfs.delay" == event->Value("type"));

bool

CaseFile::ShouldDegrade() const

{

@@ -1181,8 +1189,14 @@ CaseFile::ShouldDegrade() const

bool

CaseFile::ShouldFault() const

{

- return (std::count_if(m_events.begin(), m_events.end(),

- IsIOEvent) > ZFS_DEGRADE_IO_COUNT);

+ bool should_fault_for_io, should_fault_for_delay;

+ should_fault_for_io = std::count_if(m_events.begin(), m_events.end(),

+ IsIOEvent) > ZFS_DEGRADE_IO_COUNT;

+ should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(),

+ IsDelayEvent) > ZFS_FAULT_DELAY_COUNT;

+ return (should_fault_for_io || should_fault_for_delay);

}

nvlist_t *

diff --git a/cddl/usr.sbin/zfsd/case_file.h b/cddl/usr.sbin/zfsd/case_file.h
index e6c7010af234..9566b1586ef5 100644
--- a/cddl/usr.sbin/zfsd/case_file.h
+++ b/cddl/usr.sbin/zfsd/case_file.h

@@ -242,7 +242,11 @@ protected:

* to transition a vdev from healthy to degraded

* status.

- ZFS_DEGRADE_IO_COUNT = 50

+ ZFS_DEGRADE_IO_COUNT = 50,

+ /**

+ * The number of delay errors on a vdev required to fault it

+ */

+ ZFS_FAULT_DELAY_COUNT = 8,

};

static CalloutFunc_t OnGracePeriodEnded;

diff --git a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
index caeb077a3de8..d76abb54c9ed 100644
--- a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
+++ b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc

@@ -520,6 +520,72 @@ TEST_F(CaseFileTest, PoolDestroy)

}

+ * A Vdev with a very large number of Delay errors should fault

+ * For performance reasons, RefreshVdevState should be called at most once

+ */

+TEST_F(CaseFileTest, VeryManyDelayErrors)

+ EXPECT_CALL(*m_caseFile, RefreshVdevState())

+ .Times(::testing::AtMost(1))

+ .WillRepeatedly(::testing::Return(true));

+ for(int i=0; i<100; i++) {

+ stringstream evStringStream;

+ evStringStream <<

+ "!system=ZFS "

+ "class=ereport.fs.zfs.delay "

+ "ena=12091638756982918145 "

+ "parent_guid=13237004955564865395 "

+ "parent_type=raidz "

+ "pool=testpool.4415 "

+ "pool_context=0 "

+ "pool_failmode=wait "

+ "pool_guid=456 "

+ "pool_state= 0"

+ "subsystem=ZFS "

+ "time=";

+ evStringStream << i << "0000000000000000 ";

+ evStringStream << "timestamp=" << i << " ";

+ evStringStream <<

+ "type=ereport.fs.zfs.delay "

+ "vdev_ashift=12 "

+ "vdev_cksum_errors=0 "

+ "vdev_complete_ts=948336226469 "

+ "vdev_delays=77 "

+ "vdev_delta_ts=123998485899 "

+ "vdev_guid=123 "

+ "vdev_path=/dev/da400 "

+ "vdev_read_errors=0 "

+ "vdev_spare_guids= "

+ "vdev_type=disk "

+ "vdev_write_errors=0 "

+ "zio_blkid=622 "

+ "zio_delay=31000041101 "

+ "zio_delta=123998485899 "

+ "zio_err=0 "

+ "zio_flags=1572992 "

+ "zio_level=-2 "

+ "zio_object=0 "

+ "zio_objset=37 "

+ "zio_offset=25598976 "

+ "zio_pipeline=48234496 "

+ "zio_priority=3 "

+ "zio_size=1024"

+ "zio_stage=33554432 "

+ "zio_timestamp=824337740570 ";

+ Event *event(Event::CreateEvent(*m_eventFactory,

+ evStringStream.str()));

+ ZfsEvent *zfs_event = static_cast<ZfsEvent*>(event);

+ EXPECT_TRUE(m_caseFile->ReEvaluate(*zfs_event));

+ delete event;

+ }

+ m_caseFile->SpliceEvents();

+ EXPECT_FALSE(m_caseFile->ShouldDegrade());

+ EXPECT_TRUE(m_caseFile->ShouldFault());

+/*

* A Vdev with a very large number of IO errors should fault

* For performance reasons, RefreshVdevState should be called at most once

diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index a1863a03196c..7d3f29a7359e 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile

@@ -21,6 +21,7 @@ ${PACKAGE}FILES+= zfsd_autoreplace_003_pos.ksh

${PACKAGE}FILES+= zfsd_degrade_001_pos.ksh

${PACKAGE}FILES+= zfsd_degrade_002_pos.ksh

${PACKAGE}FILES+= zfsd_fault_001_pos.ksh

+${PACKAGE}FILES+= zfsd_fault_002_pos.ksh

${PACKAGE}FILES+= zfsd_hotspare_001_pos.ksh

${PACKAGE}FILES+= zfsd_hotspare_002_pos.ksh

${PACKAGE}FILES+= zfsd_hotspare_003_pos.ksh

diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh
new file mode 100644
index 000000000000..d8de1ceaff4b
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh

@@ -0,0 +1,99 @@

+#!/usr/local/bin/ksh93 -p

+# CDDL HEADER START

+# The contents of this file are subject to the terms of the

+# Common Development and Distribution License (the "License").

+# You may not use this file except in compliance with the License.

+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

+# or http://www.opensolaris.org/os/licensing.

+# See the License for the specific language governing permissions

+# and limitations under the License.

+# When distributing Covered Code, include this CDDL HEADER in each

+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.

+# If applicable, add the following below this CDDL HEADER, with the

+# fields enclosed by brackets "[]" replaced with your own identifying

+# information: Portions Copyright [yyyy] [name of copyright owner]

+# CDDL HEADER END

+# Use is subject to license terms.

+# $FreeBSD$

+. $STF_SUITE/include/libtest.kshlib

+. $STF_SUITE/include/libgnop.kshlib

+################################################################################

+# __stc_assertion_start

+# ID: zfsd_fault_002_pos

+# DESCRIPTION:

+# If a vdev experiences delayed I/O, it will become faulted.

+# STRATEGY:

+# 1. Create a storage pool. Use gnop vdevs so we can inject I/O delays.

+# 2. Inject IO delays while doing IO to the pool.

+# 3. Verify that the vdev becomes FAULTED.

+# 4. ONLINE it and verify that it resilvers and joins the pool.

+# TESTABILITY: explicit

+# TEST_AUTOMATION_LEVEL: automated

+# __stc_assertion_end

+###############################################################################

+verify_runnable "global"

+log_assert "ZFS will fault a vdev that experiences delayed I/O"

+ensure_zfsd_running

+DISK0_NOP=${DISK0}.nop

+DISK1_NOP=${DISK1}.nop

+log_must create_gnops $DISK0 $DISK1

+for type in "raidz" "mirror"; do

+ log_note "Testing raid type $type"

+ # Create a pool on the supplied disks

+ create_pool $TESTPOOL $type "$DISK0_NOP" "$DISK1_NOP"

+ log_must $ZFS create $TESTPOOL/$TESTFS

+ # Cause some IO delays writing to the pool

+ while true; do

+ # ZFS currently considers an I/O to be "slow" if it's delayed

+ # for 30 seconds (zio_slow_io_ms).

+ log_must gnop configure -d 31000 -x 100 "$DISK1_NOP"

+ $DD if=/dev/zero bs=128k count=1 >> \

+ /$TESTPOOL/$TESTFS/$TESTFILE 2> /dev/null

+ $FSYNC /$TESTPOOL/$TESTFS/$TESTFILE

+ # Check to see if the pool is faulted yet

+ $ZPOOL status $TESTPOOL | grep -q 'state: DEGRADED'

+ if [ $? == 0 ]

+ then

+ log_note "$TESTPOOL got degraded"

+ $ZPOOL status -s $TESTPOOL

+ break

+ fi

+ done

+ log_must check_state $TESTPOOL $TMPDISK "FAULTED"

+ log_must gnop configure -x 0 "$DISK1_NOP"

+ destroy_pool $TESTPOOL

+ log_must $RM -rf /$TESTPOOL

+done

+log_pass

diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index de9bd7bc94ed..fe4ac4866ed3 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh

@@ -56,6 +56,37 @@ zfsd_fault_001_pos_cleanup()

}

+atf_test_case zfsd_fault_002_pos cleanup

+zfsd_fault_002_pos_head()

+ atf_set "descr" "ZFS will fault a vdev that experiences delayed I/O"

+ atf_set "require.progs" "ksh93 gnop zfs zpool zfsd"

+ atf_set "timeout" 300

+zfsd_fault_002_pos_body()

+ . $(atf_get_srcdir)/../../include/default.cfg

+ . $(atf_get_srcdir)/../hotspare/hotspare.kshlib

+ . $(atf_get_srcdir)/../hotspare/hotspare.cfg

+ . $(atf_get_srcdir)/zfsd.cfg

+ verify_disk_count "$DISKS" 2

+ verify_zfsd_running

+ ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"

+ ksh93 $(atf_get_srcdir)/zfsd_fault_002_pos.ksh

+ if [[ $? != 0 ]]; then

+ save_artifacts

+ atf_fail "Testcase failed"

+ fi

+zfsd_fault_002_pos_cleanup()

+ . $(atf_get_srcdir)/../../include/default.cfg

+ . $(atf_get_srcdir)/zfsd.cfg

+ ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"

atf_test_case zfsd_degrade_001_pos cleanup

zfsd_degrade_001_pos_head()

{

@@ -631,6 +662,7 @@ zfsd_import_001_pos_cleanup()

atf_init_test_cases()

{

atf_add_test_case zfsd_fault_001_pos

+ atf_add_test_case zfsd_fault_002_pos

atf_add_test_case zfsd_degrade_001_pos

atf_add_test_case zfsd_degrade_002_pos

atf_add_test_case zfsd_hotspare_001_pos